diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index 4f0fb013..d44f0f4e 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -111,20 +111,36 @@ jobs: ls -lh install/lib/ if [[ ${{ matrix.os }} == 'windows-latest' ]]; then - cp -v install/lib/*.dll ../pascal-api-examples/read-wav - cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr + cp -v install/lib/*.dll ../pascal-api-examples/read-wav + cp -v install/lib/*.dll ../pascal-api-examples/speaker-diarization + cp -v install/lib/*.dll ../pascal-api-examples/speech-enhancement-gtcrn + cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr + cp -v install/lib/*.dll ../pascal-api-examples/tts cp -v install/lib/*.dll ../pascal-api-examples/vad cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr - cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav - cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/speaker-diarization + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/speech-enhancement-gtcrn + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr - cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts fi + - name: Run Speech Enhancement test (GTCRN) + shell: bash + run: | + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH + + cd ./pascal-api-examples + + pushd speech-enhancement-gtcrn + ./run-gtcrn.sh + ls -lh + - name: Run Pascal test (Non 
Streaming ASR) shell: bash run: | diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index a8708bf9..b1909494 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -99,11 +99,21 @@ jobs: cp -v ./install/lib/sherpa-onnx-c-api.dll ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/ cp -v ./install/lib/onnxruntime.dll ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/ ls -lh ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/ - cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/speaker-identification/ - cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/streaming-hlg-decoding/ - cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-tts/ + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/add-punctuation + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/audio-tagging + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/keyword-spotting-from-file/ cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-decode-files/ + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-speaker-diarization/ + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-tts/ + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/speaker-identification/ + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/speech-enhancement-gtcrn cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/streaming-decode-files/ + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/streaming-hlg-decoding/ + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll 
../scripts/go/_internal/vad + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad-asr-paraformer + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad-asr-whisper + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad-speaker-identification + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad-spoken-language-identification cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll $upload_dir else diff --git a/pascal-api-examples/speech-enhancement-gtcrn/.gitignore b/pascal-api-examples/speech-enhancement-gtcrn/.gitignore new file mode 100644 index 00000000..92e32999 --- /dev/null +++ b/pascal-api-examples/speech-enhancement-gtcrn/.gitignore @@ -0,0 +1 @@ +gtcrn diff --git a/pascal-api-examples/speech-enhancement-gtcrn/gtcrn.pas b/pascal-api-examples/speech-enhancement-gtcrn/gtcrn.pas new file mode 100644 index 00000000..6f60088b --- /dev/null +++ b/pascal-api-examples/speech-enhancement-gtcrn/gtcrn.pas @@ -0,0 +1,43 @@ +{ Copyright (c) 2025 Xiaomi Corporation } +{ +This file shows how to use the speech enhancement API from sherpa-onnx + +Please first download files used in this script before you run it. 
+ +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav +} +program main; + +{$mode delphi} + +uses + sherpa_onnx, + SysUtils; + +var + Wave: TSherpaOnnxWave; + + Config: TSherpaOnnxOfflineSpeechDenoiserConfig; + Sd: TSherpaOnnxOfflineSpeechDenoiser; + Audio: TSherpaOnnxDenoisedAudio; +begin + Wave := SherpaOnnxReadWave('./inp_16k.wav'); + + Initialize(Config); + + Config.Model.Gtcrn.Model := './gtcrn_simple.onnx'; + Config.Model.NumThreads:= 1; + Config.Model.Debug:= True; + Config.Model.Provider:= 'cpu'; + + Sd := TSherpaOnnxOfflineSpeechDenoiser.Create(Config); + + Audio := Sd.Run(Wave.Samples, Wave.SampleRate); + + SherpaOnnxWriteWave('./enhanced-16k.wav', Audio.Samples, Audio.SampleRate); + WriteLn('Saved to ./enhanced-16k.wav'); + + FreeAndNil(Sd); +end. + diff --git a/pascal-api-examples/speech-enhancement-gtcrn/run-gtcrn.sh b/pascal-api-examples/speech-enhancement-gtcrn/run-gtcrn.sh new file mode 100755 index 00000000..896e8de3 --- /dev/null +++ b/pascal-api-examples/speech-enhancement-gtcrn/run-gtcrn.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +if [ ! 
-f ./gtcrn_simple.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx +fi + +if [ ! -f ./inp_16k.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \ + -Fl"$SHERPA_ONNX_DIR/build/install/lib" \ + ./gtcrn.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./gtcrn + diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index ea03ab0f..e2ad34e8 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -515,6 +515,44 @@ type property GetSampleRate: Integer Read SampleRate; end; + TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig = record + Model: AnsiString; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineSpeechDenoiserModelConfig = record + Gtcrn: TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig; + NumThreads: Integer; + Debug: Boolean; + Provider: AnsiString; + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeechDenoiserModelConfig); + end; + + TSherpaOnnxOfflineSpeechDenoiserConfig = record + Model: TSherpaOnnxOfflineSpeechDenoiserModelConfig; + function ToString: AnsiString; + end; + + TSherpaOnnxDenoisedAudio = record + Samples: array of Single; + SampleRate: Integer; + end; + + TSherpaOnnxOfflineSpeechDenoiser = class + private + Handle: Pointer; + SampleRate: Integer; + _Config: TSherpaOnnxOfflineSpeechDenoiserConfig; + public + constructor Create(Config: TSherpaOnnxOfflineSpeechDenoiserConfig); + destructor Destroy; override; + + function Run(Samples: array of Single; InputSampleRate: Integer): TSherpaOnnxDenoisedAudio; + + property GetHandle: Pointer Read 
Handle; + property GetSampleRate: Integer Read SampleRate; + end; { It supports reading a single channel wave with 16-bit encoded samples. Samples are normalized to the range [-1, 1]. @@ -851,6 +889,31 @@ type PSherpaOnnxOfflineSpeakerDiarizationConfig = ^SherpaOnnxOfflineSpeakerDiarizationConfig; + SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig = record + Model: PAnsiChar; + end; + + SherpaOnnxOfflineSpeechDenoiserModelConfig = record + Gtcrn: SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig; + NumThreads: cint32; + Debug: cint32; + Provider: PAnsiChar; + end; + + SherpaOnnxOfflineSpeechDenoiserConfig = record + Model: SherpaOnnxOfflineSpeechDenoiserModelConfig; + end; + + PSherpaOnnxOfflineSpeechDenoiserConfig = ^SherpaOnnxOfflineSpeechDenoiserConfig; + + SherpaOnnxDenoisedAudio = record + Samples: pcfloat; + N: cint32; + SampleRate: cint32; + end; + + PSherpaOnnxDenoisedAudio = ^SherpaOnnxDenoisedAudio; + function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32; SampleRateOutHz: cint32; FilterCutoffHz: cfloat; @@ -872,6 +935,22 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl; external SherpaOnnxLibName; +function SherpaOnnxCreateOfflineSpeechDenoiser(Config: PSherpaOnnxOfflineSpeechDenoiserConfig): Pointer; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyOfflineSpeechDenoiser(P: Pointer); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineSpeechDenoiserGetSampleRate(P: Pointer): cint32; cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineSpeechDenoiserRun(P: Pointer; + Samples: pcfloat; N: cint32;SampleRate: cint32):PSherpaOnnxDenoisedAudio; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyDenoisedAudio(Audio: Pointer); cdecl; + external SherpaOnnxLibName; + function SherpaOnnxCreateOfflineSpeakerDiarization(Config: PSherpaOnnxOfflineSpeakerDiarizationConfig): Pointer; cdecl; external 
SherpaOnnxLibName; @@ -2358,4 +2437,93 @@ begin SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R); end; +function TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig(' + + 'Model := %s)', [Self.Model]); +end; + +function TSherpaOnnxOfflineSpeechDenoiserModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineSpeechDenoiserModelConfig(' + + 'Gtcrn := %s, '+ + 'NumThreads := %d, '+ + 'Debug := %s, '+ + 'Provider := %s)', + [Self.Gtcrn.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider]); +end; + +class operator TSherpaOnnxOfflineSpeechDenoiserModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeechDenoiserModelConfig); +begin + Dest.NumThreads := 1; + Dest.Debug := False; + Dest.Provider := 'cpu'; +end; + +function TSherpaOnnxOfflineSpeechDenoiserConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineSpeechDenoiserConfig(' + + 'Model := %s)', [Self.Model.ToString]); +end; + +constructor TSherpaOnnxOfflineSpeechDenoiser.Create(Config: TSherpaOnnxOfflineSpeechDenoiserConfig); +var + C: SherpaOnnxOfflineSpeechDenoiserConfig; +begin + C := Default(SherpaOnnxOfflineSpeechDenoiserConfig); + C.Model.Gtcrn.Model := PAnsiChar(Config.Model.Gtcrn.Model); + C.Model.NumThreads := Config.Model.NumThreads; + C.Model.Debug := Ord(Config.Model.Debug); + C.Model.Provider := PAnsiChar(Config.Model.Provider); + + Self.Handle := SherpaOnnxCreateOfflineSpeechDenoiser(@C); + Self._Config := Config; + Self.SampleRate := 0; + + if Self.Handle <> nil then + begin + Self.SampleRate := SherpaOnnxOfflineSpeechDenoiserGetSampleRate(Self.Handle); + end; +end; + +{ Releases the native denoiser if one was created, then runs the inherited destructor. 
} +destructor TSherpaOnnxOfflineSpeechDenoiser.Destroy; +begin + if Self.Handle <> nil then + begin + SherpaOnnxDestroyOfflineSpeechDenoiser(Self.Handle); + Self.Handle := nil; + end; + + inherited Destroy; +end; + +{ Runs the native speech denoiser on Samples (with InputSampleRate) and copies its output into the result. + Returns an empty record when there is no native handle or the native call returns nil. } +function TSherpaOnnxOfflineSpeechDenoiser.Run(Samples: array of Single; InputSampleRate: Integer): 
TSherpaOnnxDenoisedAudio; +var + Audio: PSherpaOnnxDenoisedAudio; + I: Integer; +begin + Result := Default(TSherpaOnnxDenoisedAudio); + + if Self.Handle = nil then + Exit; + + Audio := SherpaOnnxOfflineSpeechDenoiserRun(Self.Handle, pcfloat(Samples), Length(Samples), InputSampleRate); + if Audio = nil then + Exit; + + SetLength(Result.Samples, Audio^.N); + Result.SampleRate := Audio^.SampleRate; + + for I := Low(Result.Samples) to High(Result.Samples) do + begin + Result.Samples[I] := Audio^.Samples[I]; + end; + + { Everything needed was copied into Result, so release the native buffer. } + SherpaOnnxDestroyDenoisedAudio(Audio); +end; + end.