Add Pascal API for Moonshine models (#1482)

2024-10-27 12:21:16 +08:00
parent 54468a7370
commit cdd8e1bbcb
8 changed files with 354 additions and 3 deletions
--- a/.github/workflows/pascal.yaml
+++ b/.github/workflows/pascal.yaml
@@ -165,6 +165,10 @@ jobs:
          cd ./pascal-api-examples

          pushd vad-with-non-streaming-asr
+          time ./run-vad-with-moonshine.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
          time ./run-vad-with-whisper.sh
          rm -rf sherpa-onnx-*
          echo "---"
@@ -220,6 +224,10 @@ jobs:
          rm -rf sherpa-onnx-*
          echo "---"

+          ./run-moonshine.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
          ./run-whisper.sh
          rm -rf sherpa-onnx-*
          echo "---"
--- a/pascal-api-examples/non-streaming-asr/.gitignore
+++ b/pascal-api-examples/non-streaming-asr/.gitignore
@@ -7,3 +7,4 @@ paraformer
 paraformer_itn
 sense_voice
 telespeech_ctc
+moonshine
--- a/pascal-api-examples/non-streaming-asr/moonshine.pas
+++ b/pascal-api-examples/non-streaming-asr/moonshine.pas
@@ -0,0 +1,95 @@
+{ Copyright (c)  2024  Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming Moonshine model
+to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program moonshine;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  DateUtils,
+  SysUtils;
+
+var
+  Wave: TSherpaOnnxWave;
+  WaveFilename: AnsiString;
+
+  Config: TSherpaOnnxOfflineRecognizerConfig;
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+
+  Start: TDateTime;
+  Stop: TDateTime;
+
+  Elapsed: Single;
+  Duration: Single;
+  RealTimeFactor: Single;
+begin
+  Initialize(Config);
+
+  {A Moonshine model consists of four ONNX files.}
+  Config.ModelConfig.Moonshine.Preprocessor := './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx';
+  Config.ModelConfig.Moonshine.Encoder := './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx';
+  Config.ModelConfig.Moonshine.UncachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx';
+  Config.ModelConfig.Moonshine.CachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx';
+
+  Config.ModelConfig.Tokens := './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := False;
+
+  WaveFilename := './sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav';
+
+  Wave := SherpaOnnxReadWave(WaveFilename);
+
+  {Refuse to continue with an empty wave or a zero sample rate (presumably
+  what a failed read yields -- confirm against SherpaOnnxReadWave): both
+  would make the Duration / RealTimeFactor divisions below invalid.}
+  if (Length(Wave.Samples) = 0) or (Wave.SampleRate = 0) then
+    begin
+      WriteLn(Format('Failed to read %s', [WaveFilename]));
+      Halt(1);
+    end;
+
+  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
+  Stream := nil;
+  try
+    Stream := Recognizer.CreateStream();
+
+    {Only feeding the samples, decoding and fetching the result are timed.}
+    Start := Now;
+
+    Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
+    Recognizer.Decode(Stream);
+
+    RecognitionResult := Recognizer.GetResult(Stream);
+
+    Stop := Now;
+
+    Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
+    Duration := Length(Wave.Samples) / Wave.SampleRate;
+    RealTimeFactor := Elapsed / Duration;
+
+    WriteLn(RecognitionResult.ToString);
+    WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
+    WriteLn(Format('Elapsed %.3f s', [Elapsed]));
+    WriteLn(Format('Wave duration %.3f s', [Duration]));
+    WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
+  finally
+    {Free resources to avoid memory leak, even if an exception was raised.
+
+    Note: You don't need to invoke them for this simple script.
+    However, you have to invoke them in your own large/complex project.
+    }
+    FreeAndNil(Stream);
+    FreeAndNil(Recognizer);
+  end;
+end.
--- a/pascal-api-examples/non-streaming-asr/run-moonshine.sh
+++ b/pascal-api-examples/non-streaming-asr/run-moonshine.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# Build (if needed) the sherpa-onnx C API shared library, download the
+# Moonshine tiny English int8 model, then compile and run ./moonshine.pas.
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+# The model directory, moonshine.pas and the compiled binary are all
+# addressed relative to this script, so work from its directory no matter
+# where the caller invoked it from.
+cd -- "$SCRIPT_DIR"
+
+LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"
+
+if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
+  mkdir -p "$SHERPA_ONNX_DIR/build"
+  pushd "$SHERPA_ONNX_DIR/build"
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  ls -lh lib
+  popd
+fi
+
+if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+  tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+  rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
+  "-Fl$LIB_DIR" \
+  ./moonshine.pas
+
+export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
+export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"
+
+./moonshine
--- a/pascal-api-examples/vad-with-non-streaming-asr/.gitignore
+++ b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore
@@ -1,3 +1,4 @@
 !run-*.sh
 vad_with_whisper
 vad_with_sense_voice
+vad_with_moonshine
--- a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-moonshine.sh
+++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-moonshine.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Build (if needed) the sherpa-onnx C API shared library, download silero VAD,
+# a test wave and the Moonshine tiny English int8 model, then compile and run
+# ./vad_with_moonshine.pas.
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+# The downloaded files, vad_with_moonshine.pas and the compiled binary are
+# all addressed relative to this script, so work from its directory no
+# matter where the caller invoked it from.
+cd -- "$SCRIPT_DIR"
+
+LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"
+
+if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
+  mkdir -p "$SHERPA_ONNX_DIR/build"
+  pushd "$SHERPA_ONNX_DIR/build"
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+if [[ ! -f ./silero_vad.onnx ]]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./Obama.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+fi
+
+if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+  tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+  rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
+  "-Fl$LIB_DIR" \
+  ./vad_with_moonshine.pas
+
+export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
+export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"
+
+./vad_with_moonshine
--- a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_moonshine.pas
+++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_moonshine.pas
@@ -0,0 +1,151 @@
+{ Copyright (c)  2024  Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming Moonshine model
+with silero VAD to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program vad_with_moonshine;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  SysUtils;
+
+{Create a silero VAD that loads ./silero_vad.onnx and is configured for
+16000 Hz input. The caller owns the returned object and must free it.}
+function CreateVad(): TSherpaOnnxVoiceActivityDetector;
+var
+  Config: TSherpaOnnxVadModelConfig;
+
+  SampleRate: Integer;
+  WindowSize: Integer;
+begin
+  Initialize(Config);
+
+  SampleRate := 16000; {Please don't change it unless you know the details}
+  WindowSize := 512; {Please don't change it unless you know the details}
+
+  Config.SileroVad.Model := './silero_vad.onnx';
+  Config.SileroVad.MinSpeechDuration := 0.5;
+  Config.SileroVad.MinSilenceDuration := 0.5;
+  Config.SileroVad.Threshold := 0.5;
+  Config.SileroVad.WindowSize := WindowSize;
+  Config.NumThreads:= 1;
+  Config.Debug:= True;
+  Config.Provider:= 'cpu';
+  Config.SampleRate := SampleRate;
+
+  Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
+end;
+
+{Create an offline recognizer for the Moonshine model files located in
+./sherpa-onnx-moonshine-tiny-en-int8.
+The caller owns the returned object and must free it.}
+function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
+var
+  Config: TSherpaOnnxOfflineRecognizerConfig;
+begin
+  Initialize(Config);
+
+  Config.ModelConfig.Moonshine.Preprocessor := './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx';
+  Config.ModelConfig.Moonshine.Encoder := './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx';
+  Config.ModelConfig.Moonshine.UncachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx';
+  Config.ModelConfig.Moonshine.CachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx';
+
+  Config.ModelConfig.Tokens := './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := False;
+
+  Result := TSherpaOnnxOfflineRecognizer.Create(Config);
+end;
+
+{Decode every speech segment currently queued in Vad and print one
+"start -- end text" line (times in seconds) per segment.
+The queue of Vad is empty when this procedure returns.}
+procedure DecodePendingSegments(Vad: TSherpaOnnxVoiceActivityDetector;
+  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
+var
+  SpeechSegment: TSherpaOnnxSpeechSegment;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+
+  Start: Single;
+  Duration: Single;
+begin
+  while not Vad.IsEmpty do
+    begin
+      SpeechSegment := Vad.Front();
+      Vad.Pop();
+
+      Stream := Recognizer.CreateStream();
+      try
+        Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
+        Recognizer.Decode(Stream);
+        RecognitionResult := Recognizer.GetResult(Stream);
+
+        Start := SpeechSegment.Start / SampleRate;
+        Duration := Length(SpeechSegment.Samples) / SampleRate;
+        WriteLn(Format('%.3f -- %.3f %s',
+          [Start, Start + Duration, RecognitionResult.Text]));
+      finally
+        {One stream per segment; free it even if decoding raised.}
+        FreeAndNil(Stream);
+      end;
+    end;
+end;
+
+var
+  Wave: TSherpaOnnxWave;
+
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Vad: TSherpaOnnxVoiceActivityDetector;
+
+  Offset: Integer;
+  WindowSize: Integer;
+begin
+  Vad := nil;
+  Recognizer := nil;
+  try
+    Vad := CreateVad();
+    Recognizer := CreateOfflineRecognizer();
+
+    Wave := SherpaOnnxReadWave('./Obama.wav');
+    if Wave.SampleRate <> Vad.Config.SampleRate then
+      begin
+        WriteLn(Format('Expected sample rate: %d. Given: %d',
+          [Vad.Config.SampleRate, Wave.SampleRate]));
+
+        {Report the failure to the caller (e.g. CI) through the exit status.}
+        ExitCode := 1;
+      end
+    else
+      begin
+        {Feed the wave to the VAD one window at a time and decode segments
+        as soon as they are queued. Trailing samples that do not fill a
+        whole window are not fed.}
+        WindowSize := Vad.Config.SileroVad.WindowSize;
+        Offset := 0;
+        while Offset + WindowSize <= Length(Wave.Samples) do
+          begin
+            Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
+            Offset += WindowSize;
+
+            DecodePendingSegments(Vad, Recognizer, Wave.SampleRate);
+          end;
+
+        {Flush the VAD, then decode any segments left in its queue.}
+        Vad.Flush;
+        DecodePendingSegments(Vad, Recognizer, Wave.SampleRate);
+      end;
+  finally
+    {Runs on every path, including the sample rate mismatch above.}
+    FreeAndNil(Recognizer);
+    FreeAndNil(Vad);
+  end;
+end.
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
@@ -250,6 +250,14 @@ type
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
  end;

+  TSherpaOnnxOfflineMoonshineModelConfig = record  { File paths of the ONNX files of a Moonshine model }
+    Preprocessor: AnsiString;     { e.g. path to preprocess.onnx }
+    Encoder: AnsiString;          { e.g. path to encode.int8.onnx }
+    UncachedDecoder: AnsiString;  { e.g. path to uncached_decode.int8.onnx }
+    CachedDecoder: AnsiString;    { e.g. path to cached_decode.int8.onnx }
+    function ToString: AnsiString;  { Returns a text listing all four fields }
+  end;
+
  TSherpaOnnxOfflineTdnnModelConfig = record
    Model: AnsiString;
    function ToString: AnsiString;
@@ -285,6 +293,7 @@ type
    BpeVocab: AnsiString;
    TeleSpeechCtc: AnsiString;
    SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
+    Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
    function ToString: AnsiString;
  end;
@@ -617,6 +626,12 @@ type
    Task: PAnsiChar;
    TailPaddings: cint32;
  end;
+  SherpaOnnxOfflineMoonshineModelConfig = record  { PAnsiChar mirror of TSherpaOnnxOfflineMoonshineModelConfig }
+    Preprocessor: PAnsiChar;     { NOTE(review): field order presumably has to match the C API struct -- confirm }
+    Encoder: PAnsiChar;          { Set from the AnsiString field of the same name }
+    UncachedDecoder: PAnsiChar;  { Set from the AnsiString field of the same name }
+    CachedDecoder: PAnsiChar;    { Set from the AnsiString field of the same name }
+  end;
  SherpaOnnxOfflineTdnnModelConfig = record
    Model: PAnsiChar;
  end;
@@ -644,6 +659,7 @@ type
    BpeVocab: PAnsiChar;
    TeleSpeechCtc: PAnsiChar;
    SenseVoice:  SherpaOnnxOfflineSenseVoiceModelConfig;
+    Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
  end;

  SherpaOnnxOfflineRecognizerConfig = record
@@ -1312,6 +1328,16 @@ begin
    [Self.Encoder, Self.Decoder, Self.Language, Self.Task, Self.TailPaddings]);
 end;

+{ Returns a one-line description that lists all four model file paths. }
+function TSherpaOnnxOfflineMoonshineModelConfig.ToString: AnsiString;
+begin
+  Result := Format(
+    'TSherpaOnnxOfflineMoonshineModelConfig(Preprocessor := %s, ' +
+    'Encoder := %s, UncachedDecoder := %s, CachedDecoder := %s)',
+    [Self.Preprocessor, Self.Encoder,
+     Self.UncachedDecoder, Self.CachedDecoder]);
+end;
+
 function TSherpaOnnxOfflineTdnnModelConfig.ToString: AnsiString;
 begin
  Result := Format('TSherpaOnnxOfflineTdnnModelConfig(Model := %s)',
@@ -1353,13 +1379,14 @@ begin
    'ModelingUnit := %s, ' +
    'BpeVocab := %s, ' +
    'TeleSpeechCtc := %s, ' +
-    'SenseVoice := %s' +
+    'SenseVoice := %s, ' +
+    'Moonshine := %s' +
    ')',
    [Self.Transducer.ToString, Self.Paraformer.ToString,
     Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
     Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
     Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
-     Self.TeleSpeechCtc, Self.SenseVoice.ToString
+     Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString
     ]);
 end;

@@ -1407,7 +1434,6 @@ begin

  C.ModelConfig.Tdnn.Model := PAnsiChar(Config.ModelConfig.Tdnn.Model);

-
  C.ModelConfig.Tokens := PAnsiChar(Config.ModelConfig.Tokens);
  C.ModelConfig.NumThreads := Config.ModelConfig.NumThreads;
  C.ModelConfig.Debug := Ord(Config.ModelConfig.Debug);
@@ -1421,6 +1447,11 @@ begin
  C.ModelConfig.SenseVoice.Language := PAnsiChar(Config.ModelConfig.SenseVoice.Language);
  C.ModelConfig.SenseVoice.UseItn := Ord(Config.ModelConfig.SenseVoice.UseItn);

+  C.ModelConfig.Moonshine.Preprocessor := PAnsiChar(Config.ModelConfig.Moonshine.Preprocessor);
+  C.ModelConfig.Moonshine.Encoder := PAnsiChar(Config.ModelConfig.Moonshine.Encoder);
+  C.ModelConfig.Moonshine.UncachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.UncachedDecoder);
+  C.ModelConfig.Moonshine.CachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.CachedDecoder);
+
  C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
  C.LMConfig.Scale := Config.LMConfig.Scale;