From 85df96d5280fdee3106c08f413fd7189a244e2b7 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Wed, 30 Apr 2025 23:15:28 +0800
Subject: [PATCH] Add Dart API for homophone replacer (#2167)

---
Review notes (not part of the commit message):
- OfflineRecognizerConfig.fromJson now falls back to an empty map when the
  'hr' key is absent, matching OnlineRecognizerConfig.fromJson in this patch.
- main() in sense-voice-with-hr.dart takes List<String> so that its arguments
  can be passed to ArgParser.parse().
- Both fixes are same-line edits, so hunk headers and the diffstat are
  unchanged; the post-image blob ids on the "index" lines of those two files
  were left as they were.

 .github/scripts/test-dart.sh                  | 103 +++++++++---------
 .../bin/sense-voice-with-hr.dart              |  78 +++++++++++++
 .../run-sense-voice-with-hr.sh                |  31 ++++++
 flutter/sherpa_onnx/lib/sherpa_onnx.dart      |   1 +
 .../lib/src/homophone_replacer_config.dart    |  29 +++++
 .../lib/src/offline_recognizer.dart           |  14 ++-
 .../lib/src/online_recognizer.dart            |  15 ++-
 .../lib/src/sherpa_onnx_bindings.dart         |   8 ++
 8 files changed, 226 insertions(+), 53 deletions(-)
 create mode 100644 dart-api-examples/non-streaming-asr/bin/sense-voice-with-hr.dart
 create mode 100755 dart-api-examples/non-streaming-asr/run-sense-voice-with-hr.sh
 create mode 100644 flutter/sherpa_onnx/lib/src/homophone_replacer_config.dart

diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh
index afd24408..70ff5230 100755
--- a/.github/scripts/test-dart.sh
+++ b/.github/scripts/test-dart.sh
@@ -4,6 +4,58 @@ set -ex
 
 cd dart-api-examples
 
+pushd non-streaming-asr
+
+echo '----------SenseVoice----------'
+./run-sense-voice-with-hr.sh
+./run-sense-voice.sh
+rm -rf sherpa-onnx-*
+
+echo '----------FireRedAsr----------'
+./run-fire-red-asr.sh
+rm -rf sherpa-onnx-fire-red-asr-*
+
+echo '----------NeMo transducer----------'
+./run-nemo-transducer.sh
+rm -rf sherpa-onnx-*
+
+echo '----------Dolphin CTC----------'
+./run-dolphin-ctc.sh
+rm -rf sherpa-onnx-*
+
+echo '----------NeMo CTC----------'
+./run-nemo-ctc.sh
+rm -rf sherpa-onnx-*
+
+echo '----------TeleSpeech CTC----------'
+./run-telespeech-ctc.sh
+rm -rf sherpa-onnx-*
+
+echo '----------moonshine----------'
+./run-moonshine.sh
+rm -rf sherpa-onnx-*
+
+echo '----------whisper----------'
+./run-whisper.sh
+rm -rf sherpa-onnx-*
+
+echo '----------zipformer transducer----------'
+./run-zipformer-transducer.sh
+rm -rf sherpa-onnx-*
+
+echo '----------paraformer itn----------'
+./run-paraformer-itn.sh
+
+echo '----------paraformer----------'
+./run-paraformer.sh
+rm -rf sherpa-onnx-*
+
+echo '----------VAD with paraformer----------'
+./run-vad-with-paraformer.sh
+rm -rf sherpa-onnx-*
+
+popd # non-streaming-asr
+
 pushd speech-enhancement-gtcrn
 echo "speech enhancement with gtcrn models"
 ./run.sh
@@ -101,57 +153,6 @@ pushd keyword-spotter
 ./run-zh.sh
 popd
 
-pushd non-streaming-asr
-
-echo '----------FireRedAsr----------'
-./run-fire-red-asr.sh
-rm -rf sherpa-onnx-fire-red-asr-*
-
-echo '----------SenseVoice----------'
-./run-sense-voice.sh
-rm -rf sherpa-onnx-*
-
-echo '----------NeMo transducer----------'
-./run-nemo-transducer.sh
-rm -rf sherpa-onnx-*
-
-echo '----------Dolphin CTC----------'
-./run-dolphin-ctc.sh
-rm -rf sherpa-onnx-*
-
-echo '----------NeMo CTC----------'
-./run-nemo-ctc.sh
-rm -rf sherpa-onnx-*
-
-echo '----------TeleSpeech CTC----------'
-./run-telespeech-ctc.sh
-rm -rf sherpa-onnx-*
-
-echo '----------moonshine----------'
-./run-moonshine.sh
-rm -rf sherpa-onnx-*
-
-echo '----------whisper----------'
-./run-whisper.sh
-rm -rf sherpa-onnx-*
-
-echo '----------zipformer transducer----------'
-./run-zipformer-transducer.sh
-rm -rf sherpa-onnx-*
-
-echo '----------paraformer itn----------'
-./run-paraformer-itn.sh
-
-echo '----------paraformer----------'
-./run-paraformer.sh
-rm -rf sherpa-onnx-*
-
-echo '----------VAD with paraformer----------'
-./run-vad-with-paraformer.sh
-rm -rf sherpa-onnx-*
-
-popd # non-streaming-asr
-
 pushd streaming-asr
 
 echo '----------streaming zipformer ctc HLG----------'
diff --git a/dart-api-examples/non-streaming-asr/bin/sense-voice-with-hr.dart b/dart-api-examples/non-streaming-asr/bin/sense-voice-with-hr.dart
new file mode 100644
index 00000000..85a80739
--- /dev/null
+++ b/dart-api-examples/non-streaming-asr/bin/sense-voice-with-hr.dart
@@ -0,0 +1,78 @@
+// Copyright (c) 2025 Xiaomi Corporation
+import 'dart:io';
+
+import 'package:args/args.dart';
+import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
+
+import './init.dart';
+
+void main(List<String> arguments) async {
+  await initSherpaOnnx();
+
+  final parser = ArgParser()
+    ..addOption('model', help: 'Path to the SenseVoice model')
+    ..addOption('tokens', help: 'Path to tokens.txt')
+    ..addOption('language',
+        help: 'auto, zh, en, ja, ko, yue, or leave it empty to use auto',
+        defaultsTo: '')
+    ..addOption('use-itn',
+        help: 'true to use inverse text normalization', defaultsTo: 'false')
+    ..addOption('input-wav', help: 'Path to input.wav to transcribe')
+    ..addOption('hr-dict-dir',
+        help: 'Path to jieba dict for homophone replacer')
+    ..addOption('hr-lexicon',
+        help: 'Path to lexicon.txt for homophone replacer')
+    ..addOption('hr-rule-fsts',
+        help: 'Path to replace.fst for homophone replacer');
+
+  final res = parser.parse(arguments);
+  if (res['model'] == null ||
+      res['tokens'] == null ||
+      res['hr-dict-dir'] == null ||
+      res['hr-lexicon'] == null ||
+      res['hr-rule-fsts'] == null ||
+      res['input-wav'] == null) {
+    print(parser.usage);
+    exit(1);
+  }
+
+  final model = res['model'] as String;
+  final tokens = res['tokens'] as String;
+  final inputWav = res['input-wav'] as String;
+  final language = res['language'] as String;
+  final useItn = (res['use-itn'] as String).toLowerCase() == 'true';
+  final hrDictDir = res['hr-dict-dir'] as String;
+  final hrLexicon = res['hr-lexicon'] as String;
+  final hrRuleFsts = res['hr-rule-fsts'] as String;
+
+  final senseVoice = sherpa_onnx.OfflineSenseVoiceModelConfig(
+      model: model, language: language, useInverseTextNormalization: useItn);
+
+  final modelConfig = sherpa_onnx.OfflineModelConfig(
+    senseVoice: senseVoice,
+    tokens: tokens,
+    debug: true,
+    numThreads: 1,
+  );
+
+  final hr = sherpa_onnx.HomophoneReplacerConfig(
+      dictDir: hrDictDir, lexicon: hrLexicon, ruleFsts: hrRuleFsts);
+
+  final config =
+      sherpa_onnx.OfflineRecognizerConfig(model: modelConfig, hr: hr);
+
+  final recognizer = sherpa_onnx.OfflineRecognizer(config);
+
+  final waveData = sherpa_onnx.readWave(inputWav);
+  final stream = recognizer.createStream();
+
+  stream.acceptWaveform(
+      samples: waveData.samples, sampleRate: waveData.sampleRate);
+  recognizer.decode(stream);
+
+  final result = recognizer.getResult(stream);
+  print(result.text);
+
+  stream.free();
+  recognizer.free();
+}
diff --git a/dart-api-examples/non-streaming-asr/run-sense-voice-with-hr.sh b/dart-api-examples/non-streaming-asr/run-sense-voice-with-hr.sh
new file mode 100755
index 00000000..a194541b
--- /dev/null
+++ b/dart-api-examples/non-streaming-asr/run-sense-voice-with-hr.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+set -ex
+
+dart pub get
+
+if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+fi
+
+if [ ! -d dict ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2
+  tar xf dict.tar.bz2
+  rm dict.tar.bz2
+
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/replace.fst
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/test-hr.wav
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lexicon.txt
+fi
+
+dart run \
+  ./bin/sense-voice-with-hr.dart \
+  --model ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx \
+  --tokens ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \
+  --use-itn true \
+  --hr-dict-dir ./dict \
+  --hr-lexicon ./lexicon.txt \
+  --hr-rule-fsts ./replace.fst \
+  --input-wav ./test-hr.wav
diff --git a/flutter/sherpa_onnx/lib/sherpa_onnx.dart b/flutter/sherpa_onnx/lib/sherpa_onnx.dart
index 284360e7..ba2044f3 100644
--- a/flutter/sherpa_onnx/lib/sherpa_onnx.dart
+++ b/flutter/sherpa_onnx/lib/sherpa_onnx.dart
@@ -4,6 +4,7 @@ import 'dart:ffi';
 
 export 'src/audio_tagging.dart';
 export 'src/feature_config.dart';
+export 'src/homophone_replacer_config.dart';
 export 'src/keyword_spotter.dart';
 export 'src/offline_punctuation.dart';
 export 'src/offline_recognizer.dart';
diff --git a/flutter/sherpa_onnx/lib/src/homophone_replacer_config.dart b/flutter/sherpa_onnx/lib/src/homophone_replacer_config.dart
new file mode 100644
index 00000000..e7515a6d
--- /dev/null
+++ b/flutter/sherpa_onnx/lib/src/homophone_replacer_config.dart
@@ -0,0 +1,29 @@
+// Copyright (c) 2025 Xiaomi Corporation
+
+class HomophoneReplacerConfig {
+  const HomophoneReplacerConfig(
+      {this.dictDir = '', this.lexicon = '', this.ruleFsts = ''});
+
+  factory HomophoneReplacerConfig.fromJson(Map json) {
+    return HomophoneReplacerConfig(
+      dictDir: json['dictDir'] as String? ?? '',
+      lexicon: json['lexicon'] as String? ?? '',
+      ruleFsts: json['ruleFsts'] as String? ?? '',
+    );
+  }
+
+  @override
+  String toString() {
+    return 'HomophoneReplacerConfig(dictDir: $dictDir, lexicon: $lexicon, ruleFsts: $ruleFsts)';
+  }
+
+  Map toJson() => {
+        'dictDir': dictDir,
+        'lexicon': lexicon,
+        'ruleFsts': ruleFsts,
+      };
+
+  final String dictDir;
+  final String lexicon;
+  final String ruleFsts;
+}
diff --git a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart
index ee704d6d..5e368ae0 100644
--- a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart
+++ b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart
@@ -5,6 +5,7 @@ import 'dart:ffi';
 import 'package:ffi/ffi.dart';
 
 import './feature_config.dart';
+import './homophone_replacer_config.dart';
 import './offline_stream.dart';
 import './sherpa_onnx_bindings.dart';
 import './utils.dart';
@@ -403,6 +404,7 @@ class OfflineRecognizerConfig {
     this.ruleFsts = '',
     this.ruleFars = '',
     this.blankPenalty = 0.0,
+    this.hr = const HomophoneReplacerConfig(),
   });
 
   factory OfflineRecognizerConfig.fromJson(Map json) {
@@ -421,12 +423,13 @@ class OfflineRecognizerConfig {
       ruleFsts: json['ruleFsts'] as String? ?? '',
       ruleFars: json['ruleFars'] as String? ?? '',
       blankPenalty: (json['blankPenalty'] as num?)?.toDouble() ?? 0.0,
+      hr: HomophoneReplacerConfig.fromJson(json['hr'] as Map? ?? const {}),
     );
   }
 
   @override
   String toString() {
-    return 'OfflineRecognizerConfig(feat: $feat, model: $model, lm: $lm, decodingMethod: $decodingMethod, maxActivePaths: $maxActivePaths, hotwordsFile: $hotwordsFile, hotwordsScore: $hotwordsScore, ruleFsts: $ruleFsts, ruleFars: $ruleFars, blankPenalty: $blankPenalty)';
+    return 'OfflineRecognizerConfig(feat: $feat, model: $model, lm: $lm, decodingMethod: $decodingMethod, maxActivePaths: $maxActivePaths, hotwordsFile: $hotwordsFile, hotwordsScore: $hotwordsScore, ruleFsts: $ruleFsts, ruleFars: $ruleFars, blankPenalty: $blankPenalty, hr: $hr)';
   }
 
   Map toJson() => {
@@ -440,6 +443,7 @@ class OfflineRecognizerConfig {
         'ruleFsts': ruleFsts,
         'ruleFars': ruleFars,
         'blankPenalty': blankPenalty,
+        'hr': hr.toJson(),
       };
 
   final FeatureConfig feat;
@@ -457,6 +461,7 @@ class OfflineRecognizerConfig {
   final String ruleFars;
 
   final double blankPenalty;
+  final HomophoneReplacerConfig hr;
 }
 
 class OfflineRecognizerResult {
@@ -598,8 +603,15 @@ class OfflineRecognizer {
 
     c.ref.blankPenalty = config.blankPenalty;
 
+    c.ref.hr.dictDir = config.hr.dictDir.toNativeUtf8();
+    c.ref.hr.lexicon = config.hr.lexicon.toNativeUtf8();
+    c.ref.hr.ruleFsts = config.hr.ruleFsts.toNativeUtf8();
+
     final ptr = SherpaOnnxBindings.createOfflineRecognizer?.call(c) ?? nullptr;
 
+    calloc.free(c.ref.hr.dictDir);
+    calloc.free(c.ref.hr.lexicon);
+    calloc.free(c.ref.hr.ruleFsts);
     calloc.free(c.ref.ruleFars);
     calloc.free(c.ref.ruleFsts);
     calloc.free(c.ref.hotwordsFile);
diff --git a/flutter/sherpa_onnx/lib/src/online_recognizer.dart b/flutter/sherpa_onnx/lib/src/online_recognizer.dart
index 4a4959c9..69cc4e80 100644
--- a/flutter/sherpa_onnx/lib/src/online_recognizer.dart
+++ b/flutter/sherpa_onnx/lib/src/online_recognizer.dart
@@ -5,6 +5,7 @@ import 'dart:ffi';
 import 'package:ffi/ffi.dart';
 
 import './feature_config.dart';
+import './homophone_replacer_config.dart';
 import './online_stream.dart';
 import './sherpa_onnx_bindings.dart';
 import './utils.dart';
@@ -194,6 +195,7 @@ class OnlineRecognizerConfig {
     this.ruleFsts = '',
     this.ruleFars = '',
     this.blankPenalty = 0.0,
+    this.hr = const HomophoneReplacerConfig(),
   });
 
   factory OnlineRecognizerConfig.fromJson(Map json) {
@@ -217,12 +219,14 @@ class OnlineRecognizerConfig {
       ruleFsts: json['ruleFsts'] as String? ?? '',
       ruleFars: json['ruleFars'] as String? ?? '',
       blankPenalty: (json['blankPenalty'] as num?)?.toDouble() ?? 0.0,
+      hr: HomophoneReplacerConfig.fromJson(
+          json['hr'] as Map? ?? const {}),
     );
   }
 
   @override
   String toString() {
-    return 'OnlineRecognizerConfig(feat: $feat, model: $model, decodingMethod: $decodingMethod, maxActivePaths: $maxActivePaths, enableEndpoint: $enableEndpoint, rule1MinTrailingSilence: $rule1MinTrailingSilence, rule2MinTrailingSilence: $rule2MinTrailingSilence, rule3MinUtteranceLength: $rule3MinUtteranceLength, hotwordsFile: $hotwordsFile, hotwordsScore: $hotwordsScore, ctcFstDecoderConfig: $ctcFstDecoderConfig, ruleFsts: $ruleFsts, ruleFars: $ruleFars, blankPenalty: $blankPenalty)';
+    return 'OnlineRecognizerConfig(feat: $feat, model: $model, decodingMethod: $decodingMethod, maxActivePaths: $maxActivePaths, enableEndpoint: $enableEndpoint, rule1MinTrailingSilence: $rule1MinTrailingSilence, rule2MinTrailingSilence: $rule2MinTrailingSilence, rule3MinUtteranceLength: $rule3MinUtteranceLength, hotwordsFile: $hotwordsFile, hotwordsScore: $hotwordsScore, ctcFstDecoderConfig: $ctcFstDecoderConfig, ruleFsts: $ruleFsts, ruleFars: $ruleFars, blankPenalty: $blankPenalty, hr: $hr)';
   }
 
   Map toJson() => {
@@ -240,6 +244,7 @@ class OnlineRecognizerConfig {
         'ruleFsts': ruleFsts,
         'ruleFars': ruleFars,
         'blankPenalty': blankPenalty,
+        'hr': hr.toJson(),
       };
 
   final FeatureConfig feat;
@@ -265,6 +270,7 @@ class OnlineRecognizerConfig {
   final String ruleFars;
 
   final double blankPenalty;
+  final HomophoneReplacerConfig hr;
 }
 
 class OnlineRecognizerResult {
@@ -352,8 +358,15 @@ class OnlineRecognizer {
 
     c.ref.blankPenalty = config.blankPenalty;
 
+    c.ref.hr.dictDir = config.hr.dictDir.toNativeUtf8();
+    c.ref.hr.lexicon = config.hr.lexicon.toNativeUtf8();
+    c.ref.hr.ruleFsts = config.hr.ruleFsts.toNativeUtf8();
+
     final ptr = SherpaOnnxBindings.createOnlineRecognizer?.call(c) ?? nullptr;
 
+    calloc.free(c.ref.hr.dictDir);
+    calloc.free(c.ref.hr.lexicon);
+    calloc.free(c.ref.hr.ruleFsts);
     calloc.free(c.ref.ruleFars);
     calloc.free(c.ref.ruleFsts);
     calloc.free(c.ref.ctcFstDecoderConfig.graph);
diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
index 52e26307..f33891ee 100644
--- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
+++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
@@ -353,6 +353,7 @@ final class SherpaOnnxOfflineRecognizerConfig extends Struct {
 
   @Float()
   external double blankPenalty;
+  external SherpaOnnxHomophoneReplacerConfig hr;
 }
 
 final class SherpaOnnxOnlineTransducerModelConfig extends Struct {
@@ -404,6 +405,12 @@ final class SherpaOnnxOnlineCtcFstDecoderConfig extends Struct {
   external int maxActive;
 }
 
+final class SherpaOnnxHomophoneReplacerConfig extends Struct {
+  external Pointer dictDir;
+  external Pointer lexicon;
+  external Pointer ruleFsts;
+}
+
 final class SherpaOnnxOnlineRecognizerConfig extends Struct {
   external SherpaOnnxFeatureConfig feat;
   external SherpaOnnxOnlineModelConfig model;
@@ -441,6 +448,7 @@ final class SherpaOnnxOnlineRecognizerConfig extends Struct {
 
   @Int32()
   external int hotwordsBufSize;
+  external SherpaOnnxHomophoneReplacerConfig hr;
 }
 
 final class SherpaOnnxSileroVadModelConfig extends Struct {