diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh
index 2392ca00..27199ae9 100755
--- a/.github/scripts/test-dart.sh
+++ b/.github/scripts/test-dart.sh
@@ -7,6 +7,7 @@ cd dart-api-examples
 
 pushd tts
 echo '----------matcha tts----------'
+./run-kokoro-zh-en.sh
 ./run-kokoro-en.sh
 ./run-matcha-zh.sh
 ./run-matcha-en.sh
diff --git a/dart-api-examples/tts/bin/kokoro-zh-en.dart b/dart-api-examples/tts/bin/kokoro-zh-en.dart
new file mode 100644
index 00000000..31ee4c49
--- /dev/null
+++ b/dart-api-examples/tts/bin/kokoro-zh-en.dart
@@ -0,0 +1,102 @@
+// Copyright (c) 2025 Xiaomi Corporation
+import 'dart:io';
+
+import 'package:args/args.dart';
+import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
+
+import './init.dart';
+
+void main(List<String> arguments) async {
+  await initSherpaOnnx();
+
+  final parser = ArgParser()
+    ..addOption('model', help: 'Path to the onnx model')
+    ..addOption('voices', help: 'Path to the voices.bin')
+    ..addOption('tokens', help: 'Path to tokens.txt')
+    ..addOption(
+      'data-dir',
+      help: 'Path to espeak-ng-data directory',
+      defaultsTo: '',
+    )
+    ..addOption(
+      'dict-dir',
+      help: 'Path to dict directory',
+      defaultsTo: '',
+    )
+    ..addOption(
+      'lexicon',
+      help: 'Path to lexicon files',
+      defaultsTo: '',
+    )
+    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
+    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
+    ..addOption('text', help: 'Text to generate TTS for')
+    ..addOption('output-wav', help: 'Filename to save the generated audio')
+    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
+    ..addOption(
+      'sid',
+      help: 'Speaker ID to select. Used only for multi-speaker TTS',
+      defaultsTo: '0',
+    );
+  final res = parser.parse(arguments);
+  if (res['model'] == null ||
+      res['voices'] == null ||
+      res['tokens'] == null ||
+      res['data-dir'] == null ||
+      res['dict-dir'] == null ||
+      res['lexicon'] == null ||
+      res['output-wav'] == null ||
+      res['text'] == null) {
+    print(parser.usage);
+    exit(1);
+  }
+  final model = res['model'] as String;
+  final voices = res['voices'] as String;
+  final tokens = res['tokens'] as String;
+  final dataDir = res['data-dir'] as String;
+  final dictDir = res['dict-dir'] as String;
+  final lexicon = res['lexicon'] as String;
+  final ruleFsts = res['rule-fsts'] as String;
+  final ruleFars = res['rule-fars'] as String;
+  final text = res['text'] as String;
+  final outputWav = res['output-wav'] as String;
+  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
+  final sid = int.tryParse(res['sid'] as String) ?? 0;
+
+  if (speed == 0) {
+    speed = 1.0;
+  }
+
+  final kokoro = sherpa_onnx.OfflineTtsKokoroModelConfig(
+    model: model,
+    voices: voices,
+    tokens: tokens,
+    dataDir: dataDir,
+    lengthScale: 1 / speed,
+    dictDir: dictDir,
+    lexicon: lexicon,
+  );
+
+  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
+    kokoro: kokoro,
+    numThreads: 1,
+    debug: true,
+  );
+  final config = sherpa_onnx.OfflineTtsConfig(
+    model: modelConfig,
+    maxNumSenetences: 1,
+    ruleFsts: ruleFsts,
+    ruleFars: ruleFars,
+  );
+
+  final tts = sherpa_onnx.OfflineTts(config);
+  final audio = tts.generate(text: text, sid: sid, speed: speed);
+  tts.free();
+
+  sherpa_onnx.writeWave(
+    filename: outputWav,
+    samples: audio.samples,
+    sampleRate: audio.sampleRate,
+  );
+  print('Saved to $outputWav');
+}
diff --git a/dart-api-examples/tts/run-kokoro-en.sh b/dart-api-examples/tts/run-kokoro-en.sh
index 7db694e6..78e21a28 100755
--- a/dart-api-examples/tts/run-kokoro-en.sh
+++ b/dart-api-examples/tts/run-kokoro-en.sh
@@ -22,6 +22,6 @@ dart run \
   --sid 9 \
   --speed 1.0 \
   --output-wav kokoro-en-9.wav \
-  --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." \
+  --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
 
 ls -lh *.wav
diff --git a/dart-api-examples/tts/run-kokoro-zh-en.sh b/dart-api-examples/tts/run-kokoro-zh-en.sh
new file mode 100755
index 00000000..42e4851d
--- /dev/null
+++ b/dart-api-examples/tts/run-kokoro-zh-en.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+set -ex
+
+dart pub get
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
+# to download more models
+if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
+  tar xf kokoro-multi-lang-v1_0.tar.bz2
+  rm kokoro-multi-lang-v1_0.tar.bz2
+fi
+
+dart run \
+  ./bin/kokoro-zh-en.dart \
+  --model ./kokoro-multi-lang-v1_0/model.onnx \
+  --voices ./kokoro-multi-lang-v1_0/voices.bin \
+  --tokens ./kokoro-multi-lang-v1_0/tokens.txt \
+  --data-dir ./kokoro-multi-lang-v1_0/espeak-ng-data \
+  --dict-dir ./kokoro-multi-lang-v1_0/dict \
+  --lexicon ./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
+  --sid 45 \
+  --speed 1.0 \
+  --output-wav kokoro-zh-en-45.wav \
+  --text "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
+
+ls -lh *.wav
diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
index e544da95..c22c2a52 100644
--- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
+++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
@@ -155,6 +155,8 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct {
 
   @Float()
   external double lengthScale;
+  external Pointer<Utf8> dictDir;
+  external Pointer<Utf8> lexicon;
 }
 
 final class SherpaOnnxOfflineTtsModelConfig extends Struct {
diff --git a/flutter/sherpa_onnx/lib/src/tts.dart b/flutter/sherpa_onnx/lib/src/tts.dart
index fdaf8edc..e03126d0 100644
--- a/flutter/sherpa_onnx/lib/src/tts.dart
+++ b/flutter/sherpa_onnx/lib/src/tts.dart
@@ -67,11 +67,13 @@ class OfflineTtsKokoroModelConfig {
     this.tokens = '',
     this.dataDir = '',
     this.lengthScale = 1.0,
+    this.dictDir = '',
+    this.lexicon = '',
   });
 
   @override
   String toString() {
-    return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale)';
+    return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon)';
   }
 
   final String model;
@@ -79,6 +81,8 @@ class OfflineTtsKokoroModelConfig {
   final String tokens;
   final String dataDir;
   final double lengthScale;
+  final String dictDir;
+  final String lexicon;
 }
 
 class OfflineTtsModelConfig {
@@ -166,6 +170,8 @@ class OfflineTts {
     c.ref.model.kokoro.tokens = config.model.kokoro.tokens.toNativeUtf8();
     c.ref.model.kokoro.dataDir = config.model.kokoro.dataDir.toNativeUtf8();
     c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale;
+    c.ref.model.kokoro.dictDir = config.model.kokoro.dictDir.toNativeUtf8();
+    c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8();
 
     c.ref.model.numThreads = config.model.numThreads;
     c.ref.model.debug = config.model.debug ? 1 : 0;
@@ -181,6 +187,8 @@ class OfflineTts {
     calloc.free(c.ref.ruleFsts);
     calloc.free(c.ref.model.provider);
 
+    calloc.free(c.ref.model.kokoro.lexicon);
+    calloc.free(c.ref.model.kokoro.dictDir);
     calloc.free(c.ref.model.kokoro.dataDir);
     calloc.free(c.ref.model.kokoro.tokens);
     calloc.free(c.ref.model.kokoro.voices);