From 07a5701af6c458eb0082fbe8be0e824c210b3b5c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 3 Apr 2025 15:59:38 +0800 Subject: [PATCH] Add Dart API for Dolphin CTC models (#2095) --- .github/scripts/test-dart.sh | 9 ++ dart-api-examples/non-streaming-asr/README.md | 1 + .../non-streaming-asr/bin/dolphin-ctc.dart | 52 ++++++++ .../non-streaming-asr/run-dolphin-ctc.sh | 18 +++ .../bin/dolphin-ctc.dart | 118 ++++++++++++++++++ .../run-dolphin-ctc.sh | 27 ++++ .../lib/src/offline_recognizer.dart | 33 ++++- .../lib/src/sherpa_onnx_bindings.dart | 5 + 8 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 dart-api-examples/non-streaming-asr/bin/dolphin-ctc.dart create mode 100755 dart-api-examples/non-streaming-asr/run-dolphin-ctc.sh create mode 100644 dart-api-examples/vad-with-non-streaming-asr/bin/dolphin-ctc.dart create mode 100755 dart-api-examples/vad-with-non-streaming-asr/run-dolphin-ctc.sh diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 65da3be4..afd24408 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -61,6 +61,11 @@ echo '----------ced----------' popd pushd vad-with-non-streaming-asr + +echo '----------Dolphin CTC----------' +./run-dolphin-ctc.sh +rm -rf sherpa-onnx-* + echo '----------TeleSpeech CTC----------' ./run-telespeech-ctc.sh rm -rf sherpa-onnx-* @@ -110,6 +115,10 @@ echo '----------NeMo transducer----------' ./run-nemo-transducer.sh rm -rf sherpa-onnx-* +echo '----------Dolphin CTC----------' +./run-dolphin-ctc.sh +rm -rf sherpa-onnx-* + echo '----------NeMo CTC----------' ./run-nemo-ctc.sh rm -rf sherpa-onnx-* diff --git a/dart-api-examples/non-streaming-asr/README.md b/dart-api-examples/non-streaming-asr/README.md index e897d3e9..7f5f49f2 100644 --- a/dart-api-examples/non-streaming-asr/README.md +++ b/dart-api-examples/non-streaming-asr/README.md @@ -4,6 +4,7 @@ This folder contains examples for non-streaming ASR with Dart API. | File | Description| |------|------------| +|[./bin/dolphin-ctc.dart](./bin/dolphin-ctc.dart)| Use a [Dolphin](https://github.com/DataoceanAI/Dolphin) Ctc model for speech recognition. See [./run-dolphin-ctc.sh](./run-dolphin-ctc.sh)| |[./bin/nemo-ctc.dart](./bin/nemo-ctc.dart)| Use a NeMo Ctc model for speech recognition. See [./run-nemo-ctc.sh](./run-nemo-ctc.sh)| |[./bin/nemo-transducer.dart](./bin/nemo-transducer.dart)| Use a NeMo transducer model for speech recognition. See [./run-nemo-transducer.sh](./run-nemo-transducer.sh)| |[./bin/paraformer.dart](./bin/paraformer.dart)|Use a paraformer model for speech recognition. See [./run-paraformer.sh](./run-paraformer.sh)| diff --git a/dart-api-examples/non-streaming-asr/bin/dolphin-ctc.dart b/dart-api-examples/non-streaming-asr/bin/dolphin-ctc.dart new file mode 100644 index 00000000..685d3b2f --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/dolphin-ctc.dart @@ -0,0 +1,52 @@ +// Copyright (c) 2025 Xiaomi Corporation +import 'dart:io'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the Dolphin CTC model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final dolphin = sherpa_onnx.OfflineDolphinModelConfig(model: model); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + dolphin: dolphin, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/run-dolphin-ctc.sh b/dart-api-examples/non-streaming-asr/run-dolphin-ctc.sh new file mode 100755 index 00000000..b56d6c1f --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-dolphin-ctc.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 + tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 + rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 + ls -lh sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 +fi + +dart run \ + ./bin/dolphin-ctc.dart \ + --model ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx \ + --tokens ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt \ + --input-wav ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/dolphin-ctc.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/dolphin-ctc.dart new file mode 100644 index 00000000..bb84d25e --- /dev/null +++ b/dart-api-examples/vad-with-non-streaming-asr/bin/dolphin-ctc.dart @@ -0,0 +1,118 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('silero-vad', help: 'Path to silero_vad.onnx') + ..addOption('model', help: 'Path to the Dolphin CTC model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['silero-vad'] == null || + res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + // create VAD + final sileroVad = res['silero-vad'] as String; + + final sileroVadConfig = sherpa_onnx.SileroVadModelConfig( + model: sileroVad, + minSilenceDuration: 0.25, + minSpeechDuration: 0.5, + maxSpeechDuration: 5.0, + ); + + final vadConfig = sherpa_onnx.VadModelConfig( + sileroVad: sileroVadConfig, + numThreads: 1, + debug: true, + ); + + final vad = sherpa_onnx.VoiceActivityDetector( + config: vadConfig, bufferSizeInSeconds: 10); + + // create offline recognizer + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final dolphin = sherpa_onnx.OfflineDolphinModelConfig(model: model); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + dolphin: dolphin, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + if (waveData.sampleRate != 16000) { + print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}'); + exit(1); + } + + int numSamples = waveData.samples.length; + int numIter = numSamples ~/ vadConfig.sileroVad.windowSize; + + for (int i = 0; i != numIter; ++i) { + int start = i * vadConfig.sileroVad.windowSize; + vad.acceptWaveform(Float32List.sublistView( + waveData.samples, start, start + vadConfig.sileroVad.windowSize)); + + while (!vad.isEmpty()) { + final samples = vad.front().samples; + final startTime = vad.front().start.toDouble() / waveData.sampleRate; + final endTime = + startTime + samples.length.toDouble() / waveData.sampleRate; + + final stream = recognizer.createStream(); + stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + stream.free(); + print( + '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}'); + + vad.pop(); + } + } + + vad.flush(); + + while (!vad.isEmpty()) { + final samples = vad.front().samples; + final startTime = vad.front().start.toDouble() / waveData.sampleRate; + final endTime = startTime + samples.length.toDouble() / waveData.sampleRate; + + final stream = recognizer.createStream(); + stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + stream.free(); + print( + '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}'); + + vad.pop(); + } + + vad.free(); + + recognizer.free(); +} diff --git a/dart-api-examples/vad-with-non-streaming-asr/run-dolphin-ctc.sh b/dart-api-examples/vad-with-non-streaming-asr/run-dolphin-ctc.sh new file mode 100755 index 00000000..74ecae97 --- /dev/null +++ b/dart-api-examples/vad-with-non-streaming-asr/run-dolphin-ctc.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 + tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 + rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 + ls -lh sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 +fi + +if [ ! -f ./lei-jun-test.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +if [[ ! -f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +dart run \ + ./bin/dolphin-ctc.dart \ + --silero-vad ./silero_vad.onnx \ + --model ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx \ + --tokens ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt \ + --input-wav ./lei-jun-test.wav diff --git a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart index e7bbaf8c..ee704d6d 100644 --- a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart +++ b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart @@ -82,6 +82,27 @@ class OfflineNemoEncDecCtcModelConfig { final String model; } +class OfflineDolphinModelConfig { + const OfflineDolphinModelConfig({this.model = ''}); + + factory OfflineDolphinModelConfig.fromJson(Map json) { + return OfflineDolphinModelConfig( + model: json['model'] as String? ?? '', + ); + } + + @override + String toString() { + return 'OfflineDolphinModelConfig(model: $model)'; + } + + Map toJson() => { + 'model': model, + }; + + final String model; +} + class OfflineWhisperModelConfig { const OfflineWhisperModelConfig( {this.encoder = '', @@ -265,6 +286,7 @@ class OfflineModelConfig { this.senseVoice = const OfflineSenseVoiceModelConfig(), this.moonshine = const OfflineMoonshineModelConfig(), this.fireRedAsr = const OfflineFireRedAsrModelConfig(), + this.dolphin = const OfflineDolphinModelConfig(), required this.tokens, this.numThreads = 1, this.debug = true, @@ -309,6 +331,10 @@ class OfflineModelConfig { ? OfflineFireRedAsrModelConfig.fromJson( json['fireRedAsr'] as Map) : const OfflineFireRedAsrModelConfig(), + dolphin: json['dolphin'] != null + ? OfflineDolphinModelConfig.fromJson( + json['dolphin'] as Map) + : const OfflineDolphinModelConfig(), tokens: json['tokens'] as String, numThreads: json['numThreads'] as int? ?? 1, debug: json['debug'] as bool? ?? true, @@ -322,7 +348,7 @@ class OfflineModelConfig { @override String toString() { - return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; + return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; } Map toJson() => { @@ -334,6 +360,7 @@ class OfflineModelConfig { 'senseVoice': senseVoice.toJson(), 'moonshine': moonshine.toJson(), 'fireRedAsr': fireRedAsr.toJson(), + 'dolphin': dolphin.toJson(), 'tokens': tokens, 'numThreads': numThreads, 'debug': debug, @@ -352,6 +379,7 @@ class OfflineModelConfig { final OfflineSenseVoiceModelConfig senseVoice; final OfflineMoonshineModelConfig moonshine; final OfflineFireRedAsrModelConfig fireRedAsr; + final OfflineDolphinModelConfig dolphin; final String tokens; final int numThreads; @@ -544,6 +572,8 @@ class OfflineRecognizer { c.ref.model.fireRedAsr.decoder = config.model.fireRedAsr.decoder.toNativeUtf8(); + c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8(); + c.ref.model.tokens = config.model.tokens.toNativeUtf8(); c.ref.model.numThreads = config.model.numThreads; @@ -581,6 +611,7 @@ class OfflineRecognizer { calloc.free(c.ref.model.modelType); calloc.free(c.ref.model.provider); calloc.free(c.ref.model.tokens); + calloc.free(c.ref.model.dolphin.model); calloc.free(c.ref.model.fireRedAsr.decoder); calloc.free(c.ref.model.fireRedAsr.encoder); calloc.free(c.ref.model.moonshine.cachedDecoder); diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index 09550469..52e26307 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -261,6 +261,10 @@ final class SherpaOnnxOfflineNemoEncDecCtcModelConfig extends Struct { external Pointer model; } +final class SherpaOnnxOfflineDolphinModelConfig extends Struct { + external Pointer model; +} + final class SherpaOnnxOfflineWhisperModelConfig extends Struct { external Pointer encoder; external Pointer decoder; @@ -327,6 +331,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct { external SherpaOnnxOfflineSenseVoiceModelConfig senseVoice; external SherpaOnnxOfflineMoonshineModelConfig moonshine; external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr; + external SherpaOnnxOfflineDolphinModelConfig dolphin; } final class SherpaOnnxOfflineRecognizerConfig extends Struct {