Add Pascal API for Dolphin CTC models (#2096)

2025-04-03 16:00:22 +08:00
parent 07a5701af6
commit 8137ac9f0b
11 changed files with 343 additions and 7 deletions
--- a/.github/workflows/pascal.yaml
+++ b/.github/workflows/pascal.yaml
@@ -149,6 +149,11 @@ jobs:
          cd ./pascal-api-examples
          pushd non-streaming-asr
          ./run-dolphin-ctc.sh
          rm -rf sherpa-onnx-*
          echo "---"
          ./run-zipformer-transducer.sh
          rm -rf sherpa-onnx-*
          echo "---"
@@ -253,7 +258,13 @@ jobs:
          cd ./pascal-api-examples
          pushd vad-with-non-streaming-asr
          time ./run-vad-with-dolphin-ctc.sh
          rm -rf sherpa-onnx-*
          echo "---"
          time ./run-vad-with-moonshine.sh
          rm -rf sherpa-onnx-*
          echo "---"
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ This repository supports running the following functions **locally**
 on the following platforms and operating systems:
-  - x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64)
+  - x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64), **RK NPU**
  - Linux, macOS, Windows, openKylin
  - Android, WearOS
  - iOS
--- a/pascal-api-examples/non-streaming-asr/README.md
+++ b/pascal-api-examples/non-streaming-asr/README.md
@@ -5,6 +5,7 @@ APIs with non-streaming models for speech recognition.
 |File|Description|
 |----|-----------|
 |[run-dolphin-ctc.sh](./run-dolphin-ctc.sh)|Use a non-streaming [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model for speech recognition|
 |[run-nemo-ctc.sh](./run-nemo-ctc.sh)|Use a non-streaming NeMo CTC model for speech recognition|
 |[run-nemo-transducer.sh](./run-nemo-transducer.sh)|Use a non-streaming NeMo transducer model for speech recognition|
 |[run-paraformer-itn.sh](./run-paraformer-itn.sh)|Use a non-streaming Paraformer model for speech recognition with inverse text normalization for numbers|
--- a/pascal-api-examples/non-streaming-asr/dolphin_ctc.pas
+++ b/pascal-api-examples/non-streaming-asr/dolphin_ctc.pas
@@ -0,0 +1,76 @@
 { Copyright (c)  2025  Xiaomi Corporation }
 {
 This file shows how to use a non-streaming Dolphin CTC model
 to decode files.
 You can download the model files from
 https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
 }
 program dolphin_ctc;
 {$mode objfpc}
 uses
  sherpa_onnx,
  DateUtils,
  SysUtils;
 var
  Wave: TSherpaOnnxWave;
  WaveFilename: AnsiString;
  Config: TSherpaOnnxOfflineRecognizerConfig;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: TDateTime;
  Stop: TDateTime;
  Elapsed: Single;
  Duration: Single;
  RealTimeFactor: Single;
 begin
  { Describe the model: a Dolphin CTC model only needs the model file and its tokens. }
  Initialize(Config);
  Config.ModelConfig.Dolphin.Model := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx';
  Config.ModelConfig.Tokens := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;
  WaveFilename := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav';
  Wave := SherpaOnnxReadWave(WaveFilename);
  { The sample count and the sample rate are both used as divisors below,
    so stop early instead of dividing by zero when the file yields no audio.
    NOTE(review): presumably a failed read returns an empty wave - confirm. }
  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
    begin
      WriteLn(StdErr, Format('Failed to read any samples from %s', [WaveFilename]));
      Halt(1);
    end;
  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
  try
    Stream := Recognizer.CreateStream();
    try
      { Only feeding the samples, decoding and fetching the result are timed. }
      Start := Now;
      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
      Recognizer.Decode(Stream);
      RecognitionResult := Recognizer.GetResult(Stream);
      Stop := Now;
      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
      Duration := Length(Wave.Samples) / Wave.SampleRate;
      RealTimeFactor := Elapsed / Duration;
      WriteLn(RecognitionResult.ToString);
      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
      WriteLn(Format('Wave duration %.3f s', [Duration]));
      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
    finally
      { Free resources to avoid memory leak, even if decoding raised.
        The stream is released before the recognizer that created it. }
      FreeAndNil(Stream);
    end;
  finally
    FreeAndNil(Recognizer);
  end;
 end.
--- a/pascal-api-examples/non-streaming-asr/run-dolphin-ctc.sh
+++ b/pascal-api-examples/non-streaming-asr/run-dolphin-ctc.sh
@@ -0,0 +1,42 @@
 #!/usr/bin/env bash
 # Builds the sherpa-onnx C API shared library when it is not installed yet,
 # downloads the Dolphin CTC model if needed, then compiles and runs
 # dolphin_ctc.pas.
 set -ex
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
 echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
 BUILD_DIR="$SHERPA_ONNX_DIR/build"
 LIB_DIR="$BUILD_DIR/install/lib"
 # The model, the Pascal source and the produced binary are all addressed
 # relative to this script, so run from its directory regardless of the
 # caller's working directory.
 cd "$SCRIPT_DIR"
 # Look for the library in the same place that fpc (-Fl) and the dynamic loader
 # are pointed at below.
 if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib"  && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..
  cmake --build . --target install --config Release
  ls -lh lib
  popd
 fi
 if [ ! -f ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
 fi
 fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$LIB_DIR" \
  ./dolphin_ctc.pas
 # Append the previous value only when it is non-empty: a trailing ':' would add
 # an empty entry, which the loader treats as the current directory.
 export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
 ./dolphin_ctc
--- a/pascal-api-examples/vad-with-non-streaming-asr/README.md
+++ b/pascal-api-examples/vad-with-non-streaming-asr/README.md
@@ -6,7 +6,10 @@ with non-streaming speech recognition models.
 |File|Description|
 |---------|------------|
-|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
+|[run-vad-with-dolphin-ctc.sh](./run-vad-with-dolphin-ctc.sh)|It shows how to use the VAD + [Dolphin](https://github.com/DataoceanAI/Dolphin) for speech recognition.|
-|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
+|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + [Whisper](https://github.com/openai/whisper) for speech recognition.|
 |[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) for speech recognition.|
 |[run-vad-with-moonshine.sh](./run-vad-with-moonshine.sh)|It shows how to use the VAD + [Moonshine](https://github.com/usefulsensors/moonshine) for speech recognition.|
 Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
--- a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-dolphin-ctc.sh
+++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-dolphin-ctc.sh
@@ -0,0 +1,49 @@
 #!/usr/bin/env bash
 # Builds the sherpa-onnx C API shared library when it is not installed yet,
 # downloads the silero VAD model, a test wave and the Dolphin CTC model if
 # needed, then compiles and runs vad_with_dolphin.pas.
 set -ex
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
 echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
 BUILD_DIR="$SHERPA_ONNX_DIR/build"
 LIB_DIR="$BUILD_DIR/install/lib"
 # The models, the Pascal source and the produced binary are all addressed
 # relative to this script, so run from its directory regardless of the
 # caller's working directory.
 cd "$SCRIPT_DIR"
 # Look for the library in the same place that fpc (-Fl) and the dynamic loader
 # are pointed at below.
 if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib"  && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..
  cmake --build . --target install --config Release
  popd
 fi
 if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
 fi
 if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
 fi
 if [ ! -f ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
 fi
 fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$LIB_DIR" \
  ./vad_with_dolphin.pas
 # Append the previous value only when it is non-empty: a trailing ':' would add
 # an empty entry, which the loader treats as the current directory.
 export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
 ./vad_with_dolphin
--- a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_dolphin.pas
+++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_dolphin.pas
@@ -0,0 +1,135 @@
 { Copyright (c)  2025  Xiaomi Corporation }
 {
 This file shows how to use a non-streaming Dolphin model
 with silero VAD to decode files.
 You can download the model files from
 https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
 }
 program vad_with_dolphin;
 {$mode objfpc}
 uses
  sherpa_onnx,
  SysUtils;
 { Builds the voice activity detector used by this example.
   It is configured for the silero model at ./silero_vad.onnx and runs on
   the CPU with a single thread and debug output enabled. }
 function CreateVad(): TSherpaOnnxVoiceActivityDetector;
 const
  {Please don't change these two values unless you know the details}
  ExpectedSampleRate = 16000;
  SileroWindowSize = 512;
 var
  VadConfig: TSherpaOnnxVadModelConfig;
 begin
  Initialize(VadConfig);

  { Settings shared by the whole detector }
  VadConfig.SampleRate := ExpectedSampleRate;
  VadConfig.NumThreads := 1;
  VadConfig.Provider := 'cpu';
  VadConfig.Debug := True;

  { Settings specific to the silero model }
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.WindowSize := SileroWindowSize;
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;

  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
 end;
 { Builds the non-streaming recognizer for the Dolphin CTC model.
   The model and token files are read from the extracted
   sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 directory. }
 function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
 var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
 begin
  Initialize(RecognizerConfig);

  { Runtime settings: single-threaded CPU inference without debug output }
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.Debug := False;

  { Model files }
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt';
  RecognizerConfig.ModelConfig.Dolphin.Model := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx';

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
 end;
 { Decodes every speech segment currently queued in Vad and prints one line
   per segment: start time, end time (both in seconds) and recognized text.

   Vad        - detector whose queue is drained; it is empty on return.
   Recognizer - recognizer used to create one stream per segment.
   SampleRate - sample rate of the audio, used for decoding and for
                converting sample positions to seconds. }
 procedure DecodeQueuedSegments(Vad: TSherpaOnnxVoiceActivityDetector;
  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
 var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
 begin
  while not Vad.IsEmpty do
    begin
      SpeechSegment := Vad.Front();
      Vad.Pop();
      Stream := Recognizer.CreateStream();
      try
        Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
        Recognizer.Decode(Stream);
        RecognitionResult := Recognizer.GetResult(Stream);
      finally
        { Each segment gets its own stream; release it even if decoding raised. }
        FreeAndNil(Stream);
      end;
      Start := SpeechSegment.Start / SampleRate;
      Duration := Length(SpeechSegment.Samples) / SampleRate;
      WriteLn(Format('%.3f -- %.3f %s',
        [Start, Start + Duration, RecognitionResult.Text]));
    end;
 end;

 var
  Wave: TSherpaOnnxWave;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
 begin
  { Start from nil so the finally block is safe if a constructor raises. }
  Vad := nil;
  Recognizer := nil;
  try
    Vad := CreateVad();
    Recognizer := CreateOfflineRecognizer();
    Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
    if Wave.SampleRate <> Vad.Config.SampleRate then
      begin
        WriteLn(Format('Expected sample rate: %d. Given: %d',
          [Vad.Config.SampleRate, Wave.SampleRate]));
        { The finally block below still runs, so nothing is leaked here. }
        Exit;
      end;
    WindowSize := Vad.Config.SileroVad.WindowSize;
    Offset := 0;
    { Feed the wave one full window at a time and decode segments as soon as
      the detector emits them. A trailing partial window is not fed. }
    while Offset + WindowSize <= Length(Wave.Samples) do
      begin
        Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
        Offset += WindowSize;
        DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
      end;
    { Flush the detector, then decode whatever segments it still holds. }
    Vad.Flush;
    DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
  finally
    FreeAndNil(Recognizer);
    FreeAndNil(Vad);
  end;
 end.
--- a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas
+++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas
@@ -8,7 +8,7 @@ You can download the model files from
 https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
 }
-program vad_with_whisper;
+program vad_with_sense_voice;
 {$mode objfpc}
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -1969,7 +1969,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
  return p->impl->GetOutputSamplingRate();
 }
-void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
+void SherpaOnnxLinearResamplerReset(const SherpaOnnxLinearResampler *p) {
  p->impl->Reset();
 }
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
@@ -270,6 +270,11 @@ type
    function ToString: AnsiString;
  end;
  { Pascal-side configuration for a non-streaming Dolphin model. }
  TSherpaOnnxOfflineDolphinModelConfig = record
    { Path to the model file, e.g. a model.int8.onnx file. }
    Model: AnsiString;
    { Returns a textual description of this record that includes Model. }
    function ToString: AnsiString;
  end;
  TSherpaOnnxOfflineWhisperModelConfig = record
    Encoder: AnsiString;
    Decoder: AnsiString;
@@ -331,6 +336,7 @@ type
    SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
    Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
    FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig;
    Dolphin: TSherpaOnnxOfflineDolphinModelConfig;
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
    function ToString: AnsiString;
  end;
@@ -694,6 +700,9 @@ type
  { Low-level NeMo EncDec CTC model config holding the model path as a PAnsiChar.
    NOTE(review): presumably mirrors the C API struct of the same name - confirm
    before changing the field layout. }
  SherpaOnnxOfflineNemoEncDecCtcModelConfig = record
    Model: PAnsiChar;
  end;
  { Low-level Dolphin model config holding the model path as a PAnsiChar.
    It is filled from TSherpaOnnxOfflineDolphinModelConfig.Model through a
    PAnsiChar cast when the recognizer config is converted.
    NOTE(review): presumably mirrors the C API struct of the same name - confirm
    before changing the field layout. }
  SherpaOnnxOfflineDolphinModelConfig = record
    Model: PAnsiChar;
  end;
  SherpaOnnxOfflineWhisperModelConfig = record
    Encoder: PAnsiChar;
    Decoder: PAnsiChar;
@@ -740,6 +749,7 @@ type
    SenseVoice:  SherpaOnnxOfflineSenseVoiceModelConfig;
    Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
    FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig;
    Dolphin: SherpaOnnxOfflineDolphinModelConfig;
  end;
  SherpaOnnxOfflineRecognizerConfig = record
@@ -1461,6 +1471,12 @@ begin
    [Self.Model]);
 end;
 { Renders this Dolphin model config as text with the Model path embedded. }
 function TSherpaOnnxOfflineDolphinModelConfig.ToString: AnsiString;
 const
  Template = 'TSherpaOnnxOfflineDolphinModelConfig(Model := %s)';
 begin
  Result := Format(Template, [Self.Model]);
 end;
 function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString;
 begin
  Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' +
@@ -1534,14 +1550,15 @@ begin
    'TeleSpeechCtc := %s, ' +
    'SenseVoice := %s, ' +
    'Moonshine := %s, ' +
-    'FireRedAsr := %s' +
+    'FireRedAsr := %s, ' +
    'Dolphin := %s' +
    ')',
    [Self.Transducer.ToString, Self.Paraformer.ToString,
     Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
     Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
     Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
     Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString,
-     Self.FireRedAsr.ToString
+     Self.FireRedAsr.ToString, Self.Dolphin.ToString
     ]);
 end;
@@ -1610,6 +1627,8 @@ begin
  C.ModelConfig.FireRedAsr.Encoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Encoder);
  C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder);
  C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model);
  C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
  C.LMConfig.Scale := Config.LMConfig.Scale;