From 4335e2accdbde0fded9443591ff81853cb6d3544 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 16 Jan 2025 17:58:19 +0800 Subject: [PATCH] Add Dart API for Kokoro TTS models (#1723) --- .github/scripts/test-dart.sh | 1 + dart-api-examples/tts/bin/kokoro-en.dart | 86 +++++++++++++++++++ dart-api-examples/tts/run-kokoro-en.sh | 27 ++++++ .../lib/src/sherpa_onnx_bindings.dart | 11 +++ flutter/sherpa_onnx/lib/src/tts.dart | 38 +++++++- 5 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 dart-api-examples/tts/bin/kokoro-en.dart create mode 100755 dart-api-examples/tts/run-kokoro-en.sh diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 6ba74765..2392ca00 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -7,6 +7,7 @@ cd dart-api-examples pushd tts echo '----------matcha tts----------' +./run-kokoro-en.sh ./run-matcha-zh.sh ./run-matcha-en.sh ls -lh *.wav diff --git a/dart-api-examples/tts/bin/kokoro-en.dart b/dart-api-examples/tts/bin/kokoro-en.dart new file mode 100644 index 00000000..b92d9288 --- /dev/null +++ b/dart-api-examples/tts/bin/kokoro-en.dart @@ -0,0 +1,86 @@ +// Copyright (c) 2025 Xiaomi Corporation +import 'dart:io'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the onnx model') + ..addOption('voices', help: 'Path to the voices.bin') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption( + 'data-dir', + help: 'Path to espeak-ng-data directory', + defaultsTo: '', + ) + ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '') + ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '') + ..addOption('text', help: 'Text to generate TTS for') + ..addOption('output-wav', help: 'Filename to save the generated audio') + ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0') + ..addOption( + 'sid', + help: 'Speaker ID to select. Used only for multi-speaker TTS', + defaultsTo: '0', + ); + final res = parser.parse(arguments); + if (res['model'] == null || + res['voices'] == null || + res['tokens'] == null || + res['data-dir'] == null || + res['output-wav'] == null || + res['text'] == null) { + print(parser.usage); + exit(1); + } + final model = res['model'] as String; + final voices = res['voices'] as String; + final tokens = res['tokens'] as String; + final dataDir = res['data-dir'] as String; + final ruleFsts = res['rule-fsts'] as String; + final ruleFars = res['rule-fars'] as String; + final text = res['text'] as String; + final outputWav = res['output-wav'] as String; + var speed = double.tryParse(res['speed'] as String) ?? 1.0; + final sid = int.tryParse(res['sid'] as String) ?? 0; + + if (speed == 0) { + speed = 1.0; + } + + final kokoro = sherpa_onnx.OfflineTtsKokoroModelConfig( + model: model, + voices: voices, + tokens: tokens, + dataDir: dataDir, + lengthScale: 1 / speed, + ); + + final modelConfig = sherpa_onnx.OfflineTtsModelConfig( + kokoro: kokoro, + numThreads: 1, + debug: true, + ); + final config = sherpa_onnx.OfflineTtsConfig( + model: modelConfig, + maxNumSenetences: 1, + ruleFsts: ruleFsts, + ruleFars: ruleFars, + ); + + final tts = sherpa_onnx.OfflineTts(config); + final audio = tts.generate(text: text, sid: sid, speed: speed); + tts.free(); + + sherpa_onnx.writeWave( + filename: outputWav, + samples: audio.samples, + sampleRate: audio.sampleRate, + ); + print('Saved to $outputWav'); +} diff --git a/dart-api-examples/tts/run-kokoro-en.sh b/dart-api-examples/tts/run-kokoro-en.sh new file mode 100755 index 00000000..7db694e6 --- /dev/null +++ b/dart-api-examples/tts/run-kokoro-en.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html +# to download more models +if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 + tar xf kokoro-en-v0_19.tar.bz2 + rm kokoro-en-v0_19.tar.bz2 +fi + +dart run \ + ./bin/kokoro-en.dart \ + --model ./kokoro-en-v0_19/model.onnx \ + --voices ./kokoro-en-v0_19/voices.bin \ + --tokens ./kokoro-en-v0_19/tokens.txt \ + --data-dir ./kokoro-en-v0_19/espeak-ng-data \ + --sid 9 \ + --speed 1.0 \ + --output-wav kokoro-en-9.wav \ + --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." \ + +ls -lh *.wav diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index 7baf53f2..1e41d091 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -147,6 +147,16 @@ final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct { external Pointer dictDir; } +final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct { + external Pointer model; + external Pointer voices; + external Pointer tokens; + external Pointer dataDir; + + @Float() + external double lengthScale; +} + final class SherpaOnnxOfflineTtsModelConfig extends Struct { external SherpaOnnxOfflineTtsVitsModelConfig vits; @Int32() @@ -157,6 +167,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct { external Pointer provider; external SherpaOnnxOfflineTtsMatchaModelConfig matcha; + external SherpaOnnxOfflineTtsKokoroModelConfig kokoro; } final class SherpaOnnxOfflineTtsConfig extends Struct { diff --git a/flutter/sherpa_onnx/lib/src/tts.dart b/flutter/sherpa_onnx/lib/src/tts.dart index b5dcda48..fdaf8edc 100644 --- a/flutter/sherpa_onnx/lib/src/tts.dart +++ b/flutter/sherpa_onnx/lib/src/tts.dart @@ -60,10 +60,32 @@ class OfflineTtsMatchaModelConfig { final String dictDir; } +class OfflineTtsKokoroModelConfig { + const OfflineTtsKokoroModelConfig({ + this.model = '', + this.voices = '', + this.tokens = '', + this.dataDir = '', + this.lengthScale = 1.0, + }); + + @override + String toString() { + return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale)'; + } + + final String model; + final String voices; + final String tokens; + final String dataDir; + final double lengthScale; +} + class OfflineTtsModelConfig { const OfflineTtsModelConfig({ this.vits = const OfflineTtsVitsModelConfig(), this.matcha = const OfflineTtsMatchaModelConfig(), + this.kokoro = const OfflineTtsKokoroModelConfig(), this.numThreads = 1, this.debug = true, this.provider = 'cpu', @@ -71,11 +93,12 @@ class OfflineTtsModelConfig { @override String toString() { - return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, numThreads: $numThreads, debug: $debug, provider: $provider)'; + return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, kokoro: $kokoro, numThreads: $numThreads, debug: $debug, provider: $provider)'; } final OfflineTtsVitsModelConfig vits; final OfflineTtsMatchaModelConfig matcha; + final OfflineTtsKokoroModelConfig kokoro; final int numThreads; final bool debug; final String provider; @@ -138,6 +161,12 @@ class OfflineTts { c.ref.model.matcha.lengthScale = config.model.matcha.lengthScale; c.ref.model.matcha.dictDir = config.model.matcha.dictDir.toNativeUtf8(); + c.ref.model.kokoro.model = config.model.kokoro.model.toNativeUtf8(); + c.ref.model.kokoro.voices = config.model.kokoro.voices.toNativeUtf8(); + c.ref.model.kokoro.tokens = config.model.kokoro.tokens.toNativeUtf8(); + c.ref.model.kokoro.dataDir = config.model.kokoro.dataDir.toNativeUtf8(); + c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale; + c.ref.model.numThreads = config.model.numThreads; c.ref.model.debug = config.model.debug ? 1 : 0; c.ref.model.provider = config.model.provider.toNativeUtf8(); @@ -151,12 +180,19 @@ class OfflineTts { calloc.free(c.ref.ruleFars); calloc.free(c.ref.ruleFsts); calloc.free(c.ref.model.provider); + + calloc.free(c.ref.model.kokoro.dataDir); + calloc.free(c.ref.model.kokoro.tokens); + calloc.free(c.ref.model.kokoro.voices); + calloc.free(c.ref.model.kokoro.model); + calloc.free(c.ref.model.matcha.dictDir); calloc.free(c.ref.model.matcha.dataDir); calloc.free(c.ref.model.matcha.tokens); calloc.free(c.ref.model.matcha.lexicon); calloc.free(c.ref.model.matcha.vocoder); calloc.free(c.ref.model.matcha.acousticModel); + calloc.free(c.ref.model.vits.dictDir); calloc.free(c.ref.model.vits.dataDir); calloc.free(c.ref.model.vits.tokens);