From fd9a687ec2833c79a05ff3d72f793baf6ba3c0a9 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 10 Jul 2025 14:53:33 +0800 Subject: [PATCH] Add Pascal/Go/C#/Dart API for NeMo Canary ASR models (#2367) Add support for the new NeMo Canary ASR model across multiple language bindings by introducing a Canary model configuration and setter method on the offline recognizer. - Define Canary model config in Pascal, Go, C#, Dart and update converter functions - Add SetConfig API for offline recognizer (Pascal, Go, C#, Dart) - Extend CI/workflows and example scripts to test non-streaming Canary decoding --- .github/scripts/test-dot-net.sh | 5 + .github/workflows/pascal.yaml | 4 + .github/workflows/test-go-package.yaml | 8 ++ .github/workflows/test-go.yaml | 14 +++ .../non-streaming-asr/bin/nemo-canary.dart | 84 +++++++++++++ .../non-streaming-asr/run-nemo-canary.sh | 33 +++++ .../Program.cs | 44 +++++++ .../non-streaming-canary-decode-files.csproj | 15 +++ .../non-streaming-canary-decode-files/run.sh | 11 ++ dotnet-examples/sherpa-onnx.sln | 6 + .../lib/src/offline_recognizer.dart | 84 ++++++++++++- .../lib/src/sherpa_onnx_bindings.dart | 25 ++++ .../non-streaming-canary-decode-files/go.mod | 17 +++ .../non-streaming-canary-decode-files/main.go | 113 ++++++++++++++++++ .../non-streaming-canary-decode-files/run.sh | 13 ++ .../non-streaming-asr/.gitignore | 1 + .../non-streaming-asr/nemo_canary.pas | 107 +++++++++++++++++ .../non-streaming-asr/run-nemo-canary.sh | 42 +++++++ scripts/dotnet/OfflineCanaryModelConfig.cs | 32 +++++ scripts/dotnet/OfflineModelConfig.cs | 2 + scripts/dotnet/OfflineRecognizer.cs | 8 ++ .../non-streaming-canary-decode-files/go.mod | 5 + .../non-streaming-canary-decode-files/main.go | 1 + .../non-streaming-canary-decode-files/run.sh | 1 + scripts/go/sherpa_onnx.go | 35 ++++++ .../csrc/online-transducer-nemo-model.cc | 3 +- sherpa-onnx/pascal-api/sherpa_onnx.pas | 74 +++++++++++- 27 files changed, 779 insertions(+), 8 deletions(-) create mode 
100644 dart-api-examples/non-streaming-asr/bin/nemo-canary.dart create mode 100755 dart-api-examples/non-streaming-asr/run-nemo-canary.sh create mode 100644 dotnet-examples/non-streaming-canary-decode-files/Program.cs create mode 100644 dotnet-examples/non-streaming-canary-decode-files/non-streaming-canary-decode-files.csproj create mode 100755 dotnet-examples/non-streaming-canary-decode-files/run.sh create mode 100644 go-api-examples/non-streaming-canary-decode-files/go.mod create mode 100644 go-api-examples/non-streaming-canary-decode-files/main.go create mode 100755 go-api-examples/non-streaming-canary-decode-files/run.sh create mode 100644 pascal-api-examples/non-streaming-asr/nemo_canary.pas create mode 100755 pascal-api-examples/non-streaming-asr/run-nemo-canary.sh create mode 100644 scripts/dotnet/OfflineCanaryModelConfig.cs create mode 100644 scripts/go/_internal/non-streaming-canary-decode-files/go.mod create mode 120000 scripts/go/_internal/non-streaming-canary-decode-files/main.go create mode 120000 scripts/go/_internal/non-streaming-canary-decode-files/run.sh diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 6efa11b7..c3719213 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -6,6 +6,11 @@ cd ./version-test ./run.sh ls -lh +cd ../non-streaming-canary-decode-files +./run.sh +ls -lh +rm -rf sherpa-onnx-nemo-* + cd ../offline-decode-files ./run-zipformer-ctc.sh diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index 7a07bce3..fdc55ddd 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -156,6 +156,10 @@ jobs: pushd non-streaming-asr + ./run-nemo-canary.sh + rm -rf sherpa-onnx-* + echo "---" + ./run-zipformer-ctc.sh rm -rf sherpa-onnx-* echo "---" diff --git a/.github/workflows/test-go-package.yaml b/.github/workflows/test-go-package.yaml index bf96d85e..cafbed7c 100644 --- a/.github/workflows/test-go-package.yaml +++ 
b/.github/workflows/test-go-package.yaml @@ -76,6 +76,14 @@ jobs: run: | gcc --version + - name: Test NeMo Canary ASR + if: matrix.os != 'windows-latest' + shell: bash + run: | + cd go-api-examples/non-streaming-canary-decode-files + ./run.sh + rm -rf sherpa-onnx-nemo-* + - name: Test speech enhancement (GTCRN) if: matrix.os != 'windows-latest' shell: bash diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index 18df2474..7fe72f05 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -108,6 +108,7 @@ jobs: cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/add-punctuation cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/audio-tagging cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/keyword-spotting-from-file/ + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-canary-decode-files/ cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-decode-files/ cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-speaker-diarization/ cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-tts/ @@ -148,6 +149,19 @@ jobs: name: ${{ matrix.os }}-libs path: to-upload/ + - name: Test non-streaming decoding files with NeMo Canary + shell: bash + run: | + cd scripts/go/_internal/non-streaming-canary-decode-files/ + ls -lh + go mod tidy + cat go.mod + go build + ls -lh + + ./run.sh + rm -rf sherpa-onnx-nemo-* + - name: Test streaming decoding files shell: bash run: | diff --git a/dart-api-examples/non-streaming-asr/bin/nemo-canary.dart b/dart-api-examples/non-streaming-asr/bin/nemo-canary.dart new file mode 100644 index 00000000..fe233edf --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/nemo-canary.dart @@ -0,0 +1,84 @@ +// 
Copyright (c) 2025 Xiaomi Corporation +import 'dart:io'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('encoder', help: 'Path to the NeMo Canary encoder model') + ..addOption('decoder', help: 'Path to the NeMo Canary decoder model') + ..addOption('src-lang', help: 'Language of the input audio') + ..addOption('tgt-lang', help: 'Language of the recognition result') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['encoder'] == null || + res['decoder'] == null || + res['src-lang'] == null || + res['tgt-lang'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final encoder = res['encoder'] as String; + final decoder = res['decoder'] as String; + final srcLang = res['src-lang'] as String; + final tgtLang = res['tgt-lang'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final canary = sherpa_onnx.OfflineCanaryModelConfig( + encoder: encoder, decoder: decoder, srcLang: srcLang, tgtLang: tgtLang); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + canary: canary, + tokens: tokens, + debug: false, + numThreads: 1, + ); + var config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print('Result in $tgtLang: ${result.text}'); + + stream.free(); + + // Example to change the target language to en (English) via setConfig + if (tgtLang != 'en') { + 
var json = config.toJson(); + + ((json['model'] as Map)!['canary'] + as Map)!['tgtLang'] = 'en'; + + config = sherpa_onnx.OfflineRecognizerConfig.fromJson(json); + recognizer.setConfig(config); + + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print('Result in English: ${result.text}'); + stream.free(); + } + + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/run-nemo-canary.sh b/dart-api-examples/non-streaming-asr/run-nemo-canary.sh new file mode 100755 index 00000000..8b7e1e42 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-nemo-canary.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +fi + +for tgt_lang in en de es fr; do + dart run \ + ./bin/nemo-canary.dart \ + --encoder ./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx \ + --decoder ./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx \ + --tokens ./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt \ + --src-lang en \ + --tgt-lang $tgt_lang \ + --input-wav ./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/en.wav +done + +for tgt_lang in en de; do + dart run \ + ./bin/nemo-canary.dart \ + --encoder ./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx \ + --decoder ./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx \ + --tokens ./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt \ + --src-lang de \ + --tgt-lang 
$tgt_lang \ + --input-wav ./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/de.wav +done diff --git a/dotnet-examples/non-streaming-canary-decode-files/Program.cs b/dotnet-examples/non-streaming-canary-decode-files/Program.cs new file mode 100644 index 00000000..f193a642 --- /dev/null +++ b/dotnet-examples/non-streaming-canary-decode-files/Program.cs @@ -0,0 +1,44 @@ +// Copyright (c) 2025 Xiaomi Corporation +// +// This file shows how to use a NeMo Canary model for speech recognition. +// +// You can find the model doc at +// https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html +using SherpaOnnx; + +class NonStreamingAsrCanary +{ + static void Main(string[] args) + { + // please download model files from + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + var config = new OfflineRecognizerConfig(); + config.ModelConfig.Canary.Encoder = "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx"; + config.ModelConfig.Canary.Decoder = "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx"; + config.ModelConfig.Canary.SrcLang = "en"; + config.ModelConfig.Canary.TgtLang = "en"; + config.ModelConfig.Tokens = "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt"; + config.ModelConfig.Debug = 0; + var recognizer = new OfflineRecognizer(config); + + var testWaveFilename = "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/en.wav"; + var reader = new WaveReader(testWaveFilename); + var stream = recognizer.CreateStream(); + stream.AcceptWaveform(reader.SampleRate, reader.Samples); + recognizer.Decode(stream); + var text = stream.Result.Text; + Console.WriteLine("Text (English): {0}", text); + + // Now output text in German + config.ModelConfig.Canary.TgtLang = "de"; + recognizer.SetConfig(config); + + stream = recognizer.CreateStream(); + stream.AcceptWaveform(reader.SampleRate, reader.Samples); + recognizer.Decode(stream); + text = stream.Result.Text; + Console.WriteLine("Text 
(German): {0}", text); + } +} + + diff --git a/dotnet-examples/non-streaming-canary-decode-files/non-streaming-canary-decode-files.csproj b/dotnet-examples/non-streaming-canary-decode-files/non-streaming-canary-decode-files.csproj new file mode 100644 index 00000000..e4abf5b0 --- /dev/null +++ b/dotnet-examples/non-streaming-canary-decode-files/non-streaming-canary-decode-files.csproj @@ -0,0 +1,15 @@ + + + + Exe + net8.0 + non_streaming_canary_decode_files + enable + enable + + + + + + + diff --git a/dotnet-examples/non-streaming-canary-decode-files/run.sh b/dotnet-examples/non-streaming-canary-decode-files/run.sh new file mode 100755 index 00000000..65bf4ade --- /dev/null +++ b/dotnet-examples/non-streaming-canary-decode-files/run.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -f sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +fi + +dotnet run diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index 65706b87..24255813 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -39,6 +39,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speech-enhancement-gtcrn", EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "version-test", "version-test\version-test.csproj", "{E57711E5-6546-4BA0-B627-79C94F415BC5}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "non-streaming-canary-decode-files", "non-streaming-canary-decode-files\non-streaming-canary-decode-files.csproj", "{925779DB-4429-4366-87C3-B14DD44AE1D4}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -117,6 +119,10 @@ Global 
{E57711E5-6546-4BA0-B627-79C94F415BC5}.Debug|Any CPU.Build.0 = Debug|Any CPU {E57711E5-6546-4BA0-B627-79C94F415BC5}.Release|Any CPU.ActiveCfg = Release|Any CPU {E57711E5-6546-4BA0-B627-79C94F415BC5}.Release|Any CPU.Build.0 = Release|Any CPU + {925779DB-4429-4366-87C3-B14DD44AE1D4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {925779DB-4429-4366-87C3-B14DD44AE1D4}.Debug|Any CPU.Build.0 = Debug|Any CPU + {925779DB-4429-4366-87C3-B14DD44AE1D4}.Release|Any CPU.ActiveCfg = Release|Any CPU + {925779DB-4429-4366-87C3-B14DD44AE1D4}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart index 892e2b02..05e6f54d 100644 --- a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart +++ b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart @@ -163,6 +163,44 @@ class OfflineWhisperModelConfig { final int tailPaddings; } +class OfflineCanaryModelConfig { + const OfflineCanaryModelConfig( + {this.encoder = '', + this.decoder = '', + this.srcLang = 'en', + this.tgtLang = 'en', + this.usePnc = true}); + + factory OfflineCanaryModelConfig.fromJson(Map json) { + return OfflineCanaryModelConfig( + encoder: json['encoder'] as String? ?? '', + decoder: json['decoder'] as String? ?? '', + srcLang: json['srcLang'] as String? ?? 'en', + tgtLang: json['tgtLang'] as String? ?? 'en', + usePnc: json['usePnc'] as bool? ?? 
true, + ); + } + + @override + String toString() { + return 'OfflineCanaryModelConfig(encoder: $encoder, decoder: $decoder, srcLang: $srcLang, tgtLang: $tgtLang, usePnc: $usePnc)'; + } + + Map toJson() => { + 'encoder': encoder, + 'decoder': decoder, + 'srcLang': srcLang, + 'tgtLang': tgtLang, + 'usePnc': usePnc, + }; + + final String encoder; + final String decoder; + final String srcLang; + final String tgtLang; + final bool usePnc; +} + class OfflineFireRedAsrModelConfig { const OfflineFireRedAsrModelConfig({this.encoder = '', this.decoder = ''}); @@ -310,6 +348,7 @@ class OfflineModelConfig { this.fireRedAsr = const OfflineFireRedAsrModelConfig(), this.dolphin = const OfflineDolphinModelConfig(), this.zipformerCtc = const OfflineZipformerCtcModelConfig(), + this.canary = const OfflineCanaryModelConfig(), required this.tokens, this.numThreads = 1, this.debug = true, @@ -362,6 +401,10 @@ class OfflineModelConfig { ? OfflineZipformerCtcModelConfig.fromJson( json['zipformerCtc'] as Map) : const OfflineZipformerCtcModelConfig(), + canary: json['canary'] != null + ? OfflineCanaryModelConfig.fromJson( + json['canary'] as Map) + : const OfflineCanaryModelConfig(), tokens: json['tokens'] as String, numThreads: json['numThreads'] as int? ?? 1, debug: json['debug'] as bool? ?? 
true, @@ -375,7 +418,7 @@ class OfflineModelConfig { @override String toString() { - return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, zipformerCtc: $zipformerCtc, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; + return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, zipformerCtc: $zipformerCtc, canary: $canary, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; } Map toJson() => { @@ -389,6 +432,7 @@ class OfflineModelConfig { 'fireRedAsr': fireRedAsr.toJson(), 'dolphin': dolphin.toJson(), 'zipformerCtc': zipformerCtc.toJson(), + 'canary': canary.toJson(), 'tokens': tokens, 'numThreads': numThreads, 'debug': debug, @@ -409,6 +453,7 @@ class OfflineModelConfig { final OfflineFireRedAsrModelConfig fireRedAsr; final OfflineDolphinModelConfig dolphin; final OfflineZipformerCtcModelConfig zipformerCtc; + final OfflineCanaryModelConfig canary; final String tokens; final int numThreads; @@ -549,7 +594,28 @@ class OfflineRecognizer { /// The user is responsible to call the OfflineRecognizer.free() /// method of the returned instance to avoid memory leak. + factory OfflineRecognizer(OfflineRecognizerConfig config) { + final c = convertConfig(config); + + final ptr = SherpaOnnxBindings.createOfflineRecognizer?.call(c) ?? 
nullptr; + + freeConfig(c); + + return OfflineRecognizer._(ptr: ptr, config: config); + } + + void setConfig(OfflineRecognizerConfig config) { + final c = convertConfig(config); + + SherpaOnnxBindings.offlineRecognizerSetConfig?.call(ptr, c); + + freeConfig(c); + // we don't update this.config + } + + static Pointer convertConfig( + OfflineRecognizerConfig config) { final c = calloc(); c.ref.feat.sampleRate = config.feat.sampleRate; @@ -609,6 +675,12 @@ class OfflineRecognizer { c.ref.model.zipformerCtc.model = config.model.zipformerCtc.model.toNativeUtf8(); + c.ref.model.canary.encoder = config.model.canary.encoder.toNativeUtf8(); + c.ref.model.canary.decoder = config.model.canary.decoder.toNativeUtf8(); + c.ref.model.canary.srcLang = config.model.canary.srcLang.toNativeUtf8(); + c.ref.model.canary.tgtLang = config.model.canary.tgtLang.toNativeUtf8(); + c.ref.model.canary.usePnc = config.model.canary.usePnc ? 1 : 0; + c.ref.model.tokens = config.model.tokens.toNativeUtf8(); c.ref.model.numThreads = config.model.numThreads; @@ -637,8 +709,10 @@ class OfflineRecognizer { c.ref.hr.lexicon = config.hr.lexicon.toNativeUtf8(); c.ref.hr.ruleFsts = config.hr.ruleFsts.toNativeUtf8(); - final ptr = SherpaOnnxBindings.createOfflineRecognizer?.call(c) ?? 
nullptr; + return c; + } + static void freeConfig(Pointer c) { calloc.free(c.ref.hr.dictDir); calloc.free(c.ref.hr.lexicon); calloc.free(c.ref.hr.ruleFsts); @@ -653,6 +727,10 @@ class OfflineRecognizer { calloc.free(c.ref.model.modelType); calloc.free(c.ref.model.provider); calloc.free(c.ref.model.tokens); + calloc.free(c.ref.model.canary.tgtLang); + calloc.free(c.ref.model.canary.srcLang); + calloc.free(c.ref.model.canary.decoder); + calloc.free(c.ref.model.canary.encoder); calloc.free(c.ref.model.zipformerCtc.model); calloc.free(c.ref.model.dolphin.model); calloc.free(c.ref.model.fireRedAsr.decoder); @@ -674,8 +752,6 @@ class OfflineRecognizer { calloc.free(c.ref.model.transducer.decoder); calloc.free(c.ref.model.transducer.joiner); calloc.free(c); - - return OfflineRecognizer._(ptr: ptr, config: config); } /// The user has to invoke stream.free() on the returned instance diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index 0db40397..f104fc39 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -280,6 +280,16 @@ final class SherpaOnnxOfflineWhisperModelConfig extends Struct { external int tailPaddings; } +final class SherpaOnnxOfflineCanaryModelConfig extends Struct { + external Pointer encoder; + external Pointer decoder; + external Pointer srcLang; + external Pointer tgtLang; + + @Int32() + external int usePnc; +} + final class SherpaOnnxOfflineMoonshineModelConfig extends Struct { external Pointer preprocessor; external Pointer encoder; @@ -338,6 +348,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct { external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr; external SherpaOnnxOfflineDolphinModelConfig dolphin; external SherpaOnnxOfflineZipformerCtcModelConfig zipformerCtc; + external SherpaOnnxOfflineCanaryModelConfig canary; } final class SherpaOnnxOfflineRecognizerConfig extends Struct { @@ 
-876,6 +887,14 @@ typedef CreateOfflineRecognizerNative = Pointer typedef CreateOfflineRecognizer = CreateOfflineRecognizerNative; +typedef OfflineRecognizerSetConfigNative = Void Function( + Pointer, + Pointer); + +typedef OfflineRecognizerSetConfig = void Function( + Pointer, + Pointer); + typedef DestroyOfflineRecognizerNative = Void Function( Pointer); @@ -1341,6 +1360,7 @@ class SherpaOnnxBindings { static CreateOfflineRecognizer? createOfflineRecognizer; static DestroyOfflineRecognizer? destroyOfflineRecognizer; + static OfflineRecognizerSetConfig? offlineRecognizerSetConfig; static CreateOfflineStream? createOfflineStream; static DestroyOfflineStream? destroyOfflineStream; static AcceptWaveformOffline? acceptWaveformOffline; @@ -1741,6 +1761,11 @@ class SherpaOnnxBindings { 'SherpaOnnxDestroyOfflineRecognizer') .asFunction(); + offlineRecognizerSetConfig ??= dynamicLibrary + .lookup>( + 'SherpaOnnxOfflineRecognizerSetConfig') + .asFunction(); + createOfflineStream ??= dynamicLibrary .lookup>( 'SherpaOnnxCreateOfflineStream') diff --git a/go-api-examples/non-streaming-canary-decode-files/go.mod b/go-api-examples/non-streaming-canary-decode-files/go.mod new file mode 100644 index 00000000..571f9839 --- /dev/null +++ b/go-api-examples/non-streaming-canary-decode-files/go.mod @@ -0,0 +1,17 @@ +module non-streaming-canary-decode-files + +go 1.17 + +require ( + github.com/k2-fsa/sherpa-onnx-go v1.12.4 + github.com/spf13/pflag v1.0.6 + github.com/youpy/go-wav v0.3.2 +) + +require ( + github.com/k2-fsa/sherpa-onnx-go-linux v1.12.4 // indirect + github.com/k2-fsa/sherpa-onnx-go-macos v1.12.4 // indirect + github.com/k2-fsa/sherpa-onnx-go-windows v1.12.4 // indirect + github.com/youpy/go-riff v0.1.0 // indirect + github.com/zaf/g711 v0.0.0-20190814101024-76a4a538f52b // indirect +) diff --git a/go-api-examples/non-streaming-canary-decode-files/main.go b/go-api-examples/non-streaming-canary-decode-files/main.go new file mode 100644 index 00000000..70c8b0f2 --- 
/dev/null +++ b/go-api-examples/non-streaming-canary-decode-files/main.go @@ -0,0 +1,113 @@ +package main + +import ( + "bytes" + "encoding/binary" + "log" + "os" + "strings" + + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" + "github.com/youpy/go-wav" +) + +func main() { + log.SetFlags(log.LstdFlags | log.Lmicroseconds) + + config := sherpa.OfflineRecognizerConfig{} + + config.ModelConfig.Canary.Encoder = "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx" + config.ModelConfig.Canary.Decoder = "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx" + config.ModelConfig.Canary.SrcLang = "en" + config.ModelConfig.Canary.TgtLang = "en" + config.ModelConfig.Canary.UsePnc = 1 + config.ModelConfig.Tokens = "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt" + + waveFilename := "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/en.wav" + + samples, sampleRate := readWave(waveFilename) + + log.Println("Initializing recognizer (may take several seconds)") + recognizer := sherpa.NewOfflineRecognizer(&config) + log.Println("Recognizer created!") + defer sherpa.DeleteOfflineRecognizer(recognizer) + + log.Println("Start decoding!") + stream := sherpa.NewOfflineStream(recognizer) + defer sherpa.DeleteOfflineStream(stream) + + stream.AcceptWaveform(sampleRate, samples) + + recognizer.Decode(stream) + log.Println("Decoding done!") + result := stream.GetResult() + + log.Println("Text in English: " + strings.ToLower(result.Text)) + + s := sherpa.NewOfflineStream(recognizer) + defer sherpa.DeleteOfflineStream(s) + + s.AcceptWaveform(sampleRate, samples) + + config.ModelConfig.Canary.TgtLang = "de" + recognizer.SetConfig(&config) + recognizer.Decode(s) + result = s.GetResult() + + log.Println("Text in German: " + strings.ToLower(result.Text)) +} + +func readWave(filename string) (samples []float32, sampleRate int) { + file, _ := os.Open(filename) + defer file.Close() + + reader := wav.NewReader(file) + 
format, err := reader.Format() + if err != nil { + log.Fatalf("Failed to read wave format") + } + + if format.AudioFormat != 1 { + log.Fatalf("Support only PCM format. Given: %v\n", format.AudioFormat) + } + + if format.NumChannels != 1 { + log.Fatalf("Support only 1 channel wave file. Given: %v\n", format.NumChannels) + } + + if format.BitsPerSample != 16 { + log.Fatalf("Support only 16-bit per sample. Given: %v\n", format.BitsPerSample) + } + + reader.Duration() // so that it initializes reader.Size + + buf := make([]byte, reader.Size) + n, err := reader.Read(buf) + if n != int(reader.Size) { + log.Fatalf("Failed to read %v bytes. Returned %v bytes\n", reader.Size, n) + } + + samples = samplesInt16ToFloat(buf) + sampleRate = int(format.SampleRate) + + return +} + +func samplesInt16ToFloat(inSamples []byte) []float32 { + numSamples := len(inSamples) / 2 + outSamples := make([]float32, numSamples) + + for i := 0; i != numSamples; i++ { + s := inSamples[i*2 : (i+1)*2] + + var s16 int16 + buf := bytes.NewReader(s) + err := binary.Read(buf, binary.LittleEndian, &s16) + if err != nil { + log.Fatal("Failed to parse 16-bit sample") + } + outSamples[i] = float32(s16) / 32768 + } + + return outSamples +} diff --git a/go-api-examples/non-streaming-canary-decode-files/run.sh b/go-api-examples/non-streaming-canary-decode-files/run.sh new file mode 100755 index 00000000..b9e551e4 --- /dev/null +++ b/go-api-examples/non-streaming-canary-decode-files/run.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! 
-f sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +fi + +go mod tidy +go build +./non-streaming-canary-decode-files diff --git a/pascal-api-examples/non-streaming-asr/.gitignore b/pascal-api-examples/non-streaming-asr/.gitignore index 5e1c13eb..fd34dec8 100644 --- a/pascal-api-examples/non-streaming-asr/.gitignore +++ b/pascal-api-examples/non-streaming-asr/.gitignore @@ -10,3 +10,4 @@ telespeech_ctc moonshine dolphin_ctc zipformer_ctc +nemo_canary diff --git a/pascal-api-examples/non-streaming-asr/nemo_canary.pas b/pascal-api-examples/non-streaming-asr/nemo_canary.pas new file mode 100644 index 00000000..9dc7af01 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/nemo_canary.pas @@ -0,0 +1,107 @@ +{ Copyright (c) 2025 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming NeMo Canary model +to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program nemo_canary; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Initialize(Config); + + Config.ModelConfig.Canary.Encoder := './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx'; + Config.ModelConfig.Canary.Decoder := './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx'; + Config.ModelConfig.Canary.SrcLang := 'en'; + Config.ModelConfig.Canary.TgtLang := 'en'; + Config.ModelConfig.Canary.UsePnc := True; + Config.ModelConfig.Tokens := './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/en.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, 
RealTimeFactor])); + + FreeAndNil(Stream); + + WriteLn('-----------Output German-----'); + + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + + Config.ModelConfig.Canary.TgtLang := 'de'; + Recognizer.SetConfig(Config); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/non-streaming-asr/run-nemo-canary.sh b/pascal-api-examples/non-streaming-asr/run-nemo-canary.sh new file mode 100755 index 00000000..e946e006 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-nemo-canary.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. 
+ + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! -f sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./nemo_canary.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./nemo_canary diff --git a/scripts/dotnet/OfflineCanaryModelConfig.cs b/scripts/dotnet/OfflineCanaryModelConfig.cs new file mode 100644 index 00000000..91839529 --- /dev/null +++ b/scripts/dotnet/OfflineCanaryModelConfig.cs @@ -0,0 +1,32 @@ +/// Copyright (c) 2024.5 by 东风破 + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + public struct OfflineCanaryModelConfig + { + public OfflineCanaryModelConfig() + { + Encoder = ""; + Decoder = ""; + SrcLang = "en"; + TgtLang = "en"; + UsePnc = 1; + } + [MarshalAs(UnmanagedType.LPStr)] + public string Encoder; + + [MarshalAs(UnmanagedType.LPStr)] + public string Decoder; + + [MarshalAs(UnmanagedType.LPStr)] + public string SrcLang; + + [MarshalAs(UnmanagedType.LPStr)] + public string TgtLang; + + public int UsePnc; + } +} diff --git a/scripts/dotnet/OfflineModelConfig.cs b/scripts/dotnet/OfflineModelConfig.cs index 9adf1cda..af5818b5 100644 --- a/scripts/dotnet/OfflineModelConfig.cs +++ b/scripts/dotnet/OfflineModelConfig.cs @@ -28,6 +28,7 @@ namespace SherpaOnnx FireRedAsr = new OfflineFireRedAsrModelConfig(); Dolphin = new OfflineDolphinModelConfig(); ZipformerCtc = new OfflineZipformerCtcModelConfig(); + Canary = new 
OfflineCanaryModelConfig(); } public OfflineTransducerModelConfig Transducer; public OfflineParaformerModelConfig Paraformer; @@ -62,5 +63,6 @@ namespace SherpaOnnx public OfflineFireRedAsrModelConfig FireRedAsr; public OfflineDolphinModelConfig Dolphin; public OfflineZipformerCtcModelConfig ZipformerCtc; + public OfflineCanaryModelConfig Canary; } } diff --git a/scripts/dotnet/OfflineRecognizer.cs b/scripts/dotnet/OfflineRecognizer.cs index 616c8355..7f43a7e8 100644 --- a/scripts/dotnet/OfflineRecognizer.cs +++ b/scripts/dotnet/OfflineRecognizer.cs @@ -14,6 +14,11 @@ namespace SherpaOnnx _handle = new HandleRef(this, h); } + public void SetConfig(OfflineRecognizerConfig config) + { + SherpaOnnxOfflineRecognizerSetConfig(_handle.Handle, ref config); + } + public OfflineStream CreateStream() { IntPtr p = SherpaOnnxCreateOfflineStream(_handle.Handle); @@ -65,6 +70,9 @@ namespace SherpaOnnx [DllImport(Dll.Filename)] private static extern IntPtr SherpaOnnxCreateOfflineRecognizer(ref OfflineRecognizerConfig config); + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxOfflineRecognizerSetConfig(IntPtr handle, ref OfflineRecognizerConfig config); + [DllImport(Dll.Filename)] private static extern void SherpaOnnxDestroyOfflineRecognizer(IntPtr handle); diff --git a/scripts/go/_internal/non-streaming-canary-decode-files/go.mod b/scripts/go/_internal/non-streaming-canary-decode-files/go.mod new file mode 100644 index 00000000..e8c28ff7 --- /dev/null +++ b/scripts/go/_internal/non-streaming-canary-decode-files/go.mod @@ -0,0 +1,5 @@ +module non-streaming-canary-decode-files + +go 1.17 + +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../ diff --git a/scripts/go/_internal/non-streaming-canary-decode-files/main.go b/scripts/go/_internal/non-streaming-canary-decode-files/main.go new file mode 120000 index 00000000..0e455900 --- /dev/null +++ b/scripts/go/_internal/non-streaming-canary-decode-files/main.go @@ -0,0 +1 @@ 
+../../../../go-api-examples/non-streaming-canary-decode-files/main.go \ No newline at end of file diff --git a/scripts/go/_internal/non-streaming-canary-decode-files/run.sh b/scripts/go/_internal/non-streaming-canary-decode-files/run.sh new file mode 120000 index 00000000..f1508571 --- /dev/null +++ b/scripts/go/_internal/non-streaming-canary-decode-files/run.sh @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-canary-decode-files/run.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index a055d26b..5358f85c 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -414,6 +414,14 @@ type OfflineWhisperModelConfig struct { TailPaddings int } +type OfflineCanaryModelConfig struct { + Encoder string + Decoder string + SrcLang string + TgtLang string + UsePnc int +} + type OfflineFireRedAsrModelConfig struct { Encoder string Decoder string @@ -453,6 +461,7 @@ type OfflineModelConfig struct { FireRedAsr OfflineFireRedAsrModelConfig Dolphin OfflineDolphinModelConfig ZipformerCtc OfflineZipformerCtcModelConfig + Canary OfflineCanaryModelConfig Tokens string // Path to tokens.txt // Number of threads to use for neural network computation @@ -547,6 +556,12 @@ func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_Sher c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model) c.model_config.zipformer_ctc.model = C.CString(config.ModelConfig.ZipformerCtc.Model) + c.model_config.canary.encoder = C.CString(config.ModelConfig.Canary.Encoder) + c.model_config.canary.decoder = C.CString(config.ModelConfig.Canary.Decoder) + c.model_config.canary.src_lang = C.CString(config.ModelConfig.Canary.SrcLang) + c.model_config.canary.tgt_lang = C.CString(config.ModelConfig.Canary.TgtLang) + c.model_config.canary.use_pnc = C.int(config.ModelConfig.Canary.UsePnc) + c.model_config.tokens = C.CString(config.ModelConfig.Tokens) c.model_config.num_threads = 
C.int(config.ModelConfig.NumThreads) @@ -675,6 +690,26 @@ func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig) c.model_config.zipformer_ctc.model = nil } + if c.model_config.canary.encoder != nil { + C.free(unsafe.Pointer(c.model_config.canary.encoder)) + c.model_config.canary.encoder = nil + } + + if c.model_config.canary.decoder != nil { + C.free(unsafe.Pointer(c.model_config.canary.decoder)) + c.model_config.canary.decoder = nil + } + + if c.model_config.canary.src_lang != nil { + C.free(unsafe.Pointer(c.model_config.canary.src_lang)) + c.model_config.canary.src_lang = nil + } + + if c.model_config.canary.tgt_lang != nil { + C.free(unsafe.Pointer(c.model_config.canary.tgt_lang)) + c.model_config.canary.tgt_lang = nil + } + if c.model_config.tokens != nil { C.free(unsafe.Pointer(c.model_config.tokens)) c.model_config.tokens = nil diff --git a/sherpa-onnx/csrc/online-transducer-nemo-model.cc b/sherpa-onnx/csrc/online-transducer-nemo-model.cc index a656b74a..53411f8e 100644 --- a/sherpa-onnx/csrc/online-transducer-nemo-model.cc +++ b/sherpa-onnx/csrc/online-transducer-nemo-model.cc @@ -323,7 +323,8 @@ class OnlineTransducerNeMoModel::Impl { SHERPA_ONNX_READ_META_DATA(window_size_, "window_size"); SHERPA_ONNX_READ_META_DATA(chunk_shift_, "chunk_shift"); SHERPA_ONNX_READ_META_DATA(subsampling_factor_, "subsampling_factor"); - SHERPA_ONNX_READ_META_DATA_STR(normalize_type_, "normalize_type"); + SHERPA_ONNX_READ_META_DATA_STR_ALLOW_EMPTY(normalize_type_, + "normalize_type"); SHERPA_ONNX_READ_META_DATA(pred_rnn_layers_, "pred_rnn_layers"); SHERPA_ONNX_READ_META_DATA(pred_hidden_, "pred_hidden"); diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index bf5e2ea7..d7f82409 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -299,6 +299,16 @@ type class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig); end; + 
TSherpaOnnxOfflineCanaryModelConfig = record + Encoder: AnsiString; + Decoder: AnsiString; + SrcLang: AnsiString; + TgtLang: AnsiString; + UsePnc: Boolean; + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineCanaryModelConfig); + end; + TSherpaOnnxOfflineMoonshineModelConfig = record Preprocessor: AnsiString; Encoder: AnsiString; @@ -352,6 +362,7 @@ type FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig; Dolphin: TSherpaOnnxOfflineDolphinModelConfig; ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig; + Canary: TSherpaOnnxOfflineCanaryModelConfig; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig); function ToString: AnsiString; end; @@ -398,6 +409,7 @@ type destructor Destroy; override; function CreateStream: TSherpaOnnxOfflineStream; procedure Decode(Stream: TSherpaOnnxOfflineStream); + procedure SetConfig(Config: TSherpaOnnxOfflineRecognizerConfig); function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult; property Config: TSherpaOnnxOfflineRecognizerConfig Read _Config; property GetHandle: Pointer Read Handle; @@ -742,6 +754,13 @@ type Task: PAnsiChar; TailPaddings: cint32; end; + SherpaOnnxOfflineCanaryModelConfig = record + Encoder: PAnsiChar; + Decoder: PAnsiChar; + SrcLang: PAnsiChar; + TgtLang: PAnsiChar; + UsePnc: cint32; + end; SherpaOnnxOfflineFireRedAsrModelConfig = record Encoder: PAnsiChar; Decoder: PAnsiChar; @@ -783,6 +802,7 @@ type FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig; Dolphin: SherpaOnnxOfflineDolphinModelConfig; ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig; + Canary: SherpaOnnxOfflineCanaryModelConfig; end; SherpaOnnxOfflineRecognizerConfig = record @@ -1197,6 +1217,9 @@ procedure SherpaOnnxAcceptWaveformOffline(Stream: Pointer; procedure SherpaOnnxDecodeOfflineStream(Recognizer: Pointer; Stream: Pointer); cdecl; external SherpaOnnxLibName; +procedure 
SherpaOnnxOfflineRecognizerSetConfig(Recognizer: Pointer; Config: PSherpaOnnxOfflineRecognizerConfig); cdecl; + external SherpaOnnxLibName; + function SherpaOnnxGetOfflineStreamResultAsJson(Stream: Pointer): PAnsiChar; cdecl; external SherpaOnnxLibName; @@ -1564,6 +1587,19 @@ begin [Self.Encoder, Self.Decoder, Self.Language, Self.Task, Self.TailPaddings]); end; +function TSherpaOnnxOfflineCanaryModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineCanaryModelConfig(' + + 'Encoder := %s, ' + + 'Decoder := %s, ' + + 'SrcLang := %s, ' + + 'TgtLang := %s, ' + + 'UsePnc := %s' + + ')', + [Self.Encoder, Self.Decoder, Self.SrcLang, + Self.TgtLang, Self.UsePnc.ToString]); +end; + function TSherpaOnnxOfflineFireRedAsrModelConfig.ToString: AnsiString; begin Result := Format('TSherpaOnnxOfflineFireRedAsrModelConfig(' + @@ -1627,14 +1663,16 @@ begin 'Moonshine := %s, ' + 'FireRedAsr := %s, ' + 'Dolphin := %s, ' + - 'ZipformerCtc := %s' + + 'ZipformerCtc := %s, ' + + 'Canary := %s' + ')', [Self.Transducer.ToString, Self.Paraformer.ToString, Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString, Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider, Self.ModelType, Self.ModelingUnit, Self.BpeVocab, Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString, - Self.FireRedAsr.ToString, Self.Dolphin.ToString, Self.ZipformerCtc.ToString + Self.FireRedAsr.ToString, Self.Dolphin.ToString, + Self.ZipformerCtc.ToString, Self.Canary.ToString ]); end; @@ -1660,7 +1698,7 @@ begin ]); end; -constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecognizerConfig); +function ConvertOfflineRecognizerConfig(Config: TSherpaOnnxOfflineRecognizerConfig): SherpaOnnxOfflineRecognizerConfig; var C: SherpaOnnxOfflineRecognizerConfig; begin @@ -1707,6 +1745,12 @@ begin C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model); C.ModelConfig.ZipformerCtc.Model := 
PAnsiChar(Config.ModelConfig.ZipformerCtc.Model); + C.ModelConfig.Canary.Encoder := PAnsiChar(Config.ModelConfig.Canary.Encoder); + C.ModelConfig.Canary.Decoder := PAnsiChar(Config.ModelConfig.Canary.Decoder); + C.ModelConfig.Canary.SrcLang := PAnsiChar(Config.ModelConfig.Canary.SrcLang); + C.ModelConfig.Canary.TgtLang := PAnsiChar(Config.ModelConfig.Canary.TgtLang); + C.ModelConfig.Canary.UsePnc := Ord(Config.ModelConfig.Canary.UsePnc); + C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model); C.LMConfig.Scale := Config.LMConfig.Scale; @@ -1722,10 +1766,27 @@ begin C.Hr.Lexicon := PAnsiChar(Config.Hr.Lexicon); C.Hr.RuleFsts := PAnsiChar(Config.Hr.RuleFsts); + Result := C; +end; + +constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecognizerConfig); +var + C: SherpaOnnxOfflineRecognizerConfig; +begin + C := ConvertOfflineRecognizerConfig(Config); Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C); Self._Config := Config; end; +procedure TSherpaOnnxOfflineRecognizer.SetConfig(Config: TSherpaOnnxOfflineRecognizerConfig); +var + C: SherpaOnnxOfflineRecognizerConfig; +begin + C := ConvertOfflineRecognizerConfig(Config); + SherpaOnnxOfflineRecognizerSetConfig(Self.Handle, @C); + { We don't update Self._Config } +end; + destructor TSherpaOnnxOfflineRecognizer.Destroy; begin SherpaOnnxDestroyOfflineRecognizer(Self.Handle); @@ -1912,6 +1973,13 @@ begin Dest.TailPaddings := -1; end; +class operator TSherpaOnnxOfflineCanaryModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineCanaryModelConfig); +begin + Dest.SrcLang := 'en'; + Dest.TgtLang := 'en'; + Dest.UsePnc := True; +end; + class operator TSherpaOnnxOfflineLMConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig); begin Dest.Scale := 1.0;