Support Kokoro TTS for HarmonyOS. (#1743)

2025-01-22 11:14:42 +08:00
parent 5bcd7e100a
commit bc3322e5a6
5 changed files with 82 additions and 35 deletions
--- a/harmony-os/SherpaOnnxHar/sherpa_onnx/BuildProfile.ets
+++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/BuildProfile.ets
@@ -1,7 +1,7 @@
 /**
 * Use these variables when you tailor your ArkTS code. They must be of the const type.
 */
-export const HAR_VERSION = '1.10.37';
+export const HAR_VERSION = '1.10.40';
 export const BUILD_MODE_NAME = 'debug';
 export const DEBUG = true;
 export const TARGET_NAME = 'default';
--- a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets
+++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets
@@ -31,7 +31,8 @@ export { OnlineStream,
  OnlineRecognizer,
 } from './src/main/ets/components/StreamingAsr';

-export { OfflineTtsMatchaModelConfig,
+export { OfflineTtsKokoroModelConfig,
+  OfflineTtsMatchaModelConfig,
  OfflineTtsVitsModelConfig,
  OfflineTtsModelConfig,
  OfflineTtsConfig,
--- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets
+++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets
@@ -28,9 +28,18 @@ export class OfflineTtsMatchaModelConfig {
  public lengthScale: number = 1.0;
 }

+export class OfflineTtsKokoroModelConfig {
+  public model: string = '';
+  public voices: string = '';
+  public tokens: string = '';
+  public dataDir: string = '';
+  public lengthScale: number = 1.0;
+}
+
 export class OfflineTtsModelConfig {
  public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig();
  public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig();
+  public kokoro: OfflineTtsKokoroModelConfig = new OfflineTtsKokoroModelConfig();
  public numThreads: number = 1;
  public debug: boolean = false;
  public provider: string = 'cpu';
--- a/harmony-os/SherpaOnnxTts/entry/src/main/ets/pages/Index.ets
+++ b/harmony-os/SherpaOnnxTts/entry/src/main/ets/pages/Index.ets
@@ -66,6 +66,7 @@ struct Index {
  @State initTtsDone: boolean = false;
  @State ttsGeneratedDone: boolean = true;
  @State numSpeakers: number = 1;
+  @State numThreads: number = 1;
  @State initAudioDone: boolean = false;
  private controller: TabsController = new TabsController();
  private cancelled: boolean = false;
@@ -135,6 +136,7 @@ struct Index {
        this.info = 'Model initialized!\nPlease enter text and press start.';
        this.sampleRate = e.data['sampleRate'] as number;
        this.numSpeakers = e.data['numSpeakers'] as number;
+        this.numThreads = e.data['numThreads'] as number;

        this.initTtsDone = true;
      }
@@ -177,6 +179,7 @@ struct Index {
            this.info = `Audio duration: ${audioDuration} s
 Elapsed: ${elapsedSeconds} s
 RTF = ${elapsedSeconds.toFixed(2)}/${audioDuration.toFixed(2)} = ${RTF.toFixed(3)}
+Number of threads: ${this.numThreads}
 `;
            if (this.cancelled) {
              this.info += '\nCancelled.';
--- a/harmony-os/SherpaOnnxTts/entry/src/main/ets/workers/NonStreamingTtsWorker.ets
+++ b/harmony-os/SherpaOnnxTts/entry/src/main/ets/workers/NonStreamingTtsWorker.ets
@@ -2,7 +2,7 @@ import worker, { ThreadWorkerGlobalScope, MessageEvents, ErrorEvent } from '@oho

 import { fileIo as fs } from '@kit.CoreFileKit';

-import {OfflineTtsConfig, OfflineTts, listRawfileDir, TtsInput, TtsOutput} from 'sherpa_onnx';
+import { OfflineTtsConfig, OfflineTts, listRawfileDir, TtsInput, TtsOutput } from 'sherpa_onnx';
 import { buffer } from '@kit.ArkTS';

 const workerPort: ThreadWorkerGlobalScope = worker.workerPort;
@@ -42,18 +42,22 @@ function copyRawFileDirToSandbox(context: Context, srcDir: string) {
  }
 }

-function copyRawFileToSandbox(context: Context, src: string, dst: string) {
-  // see https://blog.csdn.net/weixin_44640245/article/details/142634846
-  // https://developer.huawei.com/consumer/cn/doc/harmonyos-guides-V5/rawfile-guidelines-V5
+function copyRawFileToSandbox(context: Context, src: string,
+  dst: string) {
+  /* see
+   https://blog.csdn.net/weixin_44640245/article/details/142634846
+   https://developer.huawei.com/consumer/cn/doc/harmonyos-guides-V5/rawfile-guidelines-V5
+   */
  let uint8Array: Uint8Array = context.resourceManager.getRawFileContentSync(src);

  // https://developer.huawei.com/consumer/cn/doc/harmonyos-references-V5/js-apis-file-fs-V5#fsmkdir
  let sandboxPath: string = context.getApplicationContext().filesDir;
-  let filepath = sandboxPath + '/'  + dst;
+  let filepath = sandboxPath + '/' + dst;

  if (fs.accessSync(filepath)) {
-    // if the destination exists and has the expected file size,
-    // then we skip copying it
+    /* if the destination exists and has the expected file size,
+       then we skip copying it
+     */
    let stat = fs.statSync(filepath);
    if (stat.size == uint8Array.length) {
      return;
@@ -66,11 +70,12 @@ function copyRawFileToSandbox(context: Context, src: string, dst: string) {
 }

 function initTts(context: Context): OfflineTts {
-  // Such a design is to make it easier to build flutter APPs with
-  // github actions for a variety of tts models
-  //
-  // See https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/flutter/generate-tts.py
-  // for details
+  /* Such a design is to make it easier to build flutter APPs with
+     github actions for a variety of tts models
+
+     See https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/flutter/generate-tts.py
+     for details
+   */

  let modelDir = '';

@@ -83,13 +88,19 @@ function initTts(context: Context): OfflineTts {
  let vocoder = '';
  // for Matcha end

+  // for Kokoro begin
+  let voices = '';
+  // for Kokoro end
+
  let ruleFsts = '';
  let ruleFars = '';
  let lexicon = '';
  let dataDir = '';
  let dictDir = '';
-  // You can select an example below and change it according to match your
-  // selected tts model
+  /*
+    You can select an example below and change it to match your
+    selected tts model
+   */

  // ============================================================
  // Your change starts here
@@ -146,19 +157,26 @@ function initTts(context: Context): OfflineTts {
  // Example 8
  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
-  // modelDir = 'matcha-icefall-zh-baker'
-  // acousticModelName = 'model-steps-3.onnx'
-  // vocoder = 'hifigan_v2.onnx'
-  // lexicon = 'lexicon.txt'
+  // modelDir = 'matcha-icefall-zh-baker';
+  // acousticModelName = 'model-steps-3.onnx';
+  // vocoder = 'hifigan_v2.onnx';
+  // lexicon = 'lexicon.txt';
  // dictDir = 'dict';
  // ruleFsts = `date.fst,phone.fst,number.fst`;

  // Example 9
  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
-  // modelDir = 'matcha-icefall-en_US-ljspeech'
-  // acousticModelName = 'model-steps-3.onnx'
-  // vocoder = 'hifigan_v2.onnx'
+  // modelDir = 'matcha-icefall-en_US-ljspeech';
+  // acousticModelName = 'model-steps-3.onnx';
+  // vocoder = 'hifigan_v2.onnx';
+  // dataDir = 'espeak-ng-data';
+
+  // Example 10
+  // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html#kokoro-en-v0-19-english-11-speakers
+  // modelDir = 'kokoro-en-v0_19';
+  // modelName = 'model.onnx';
+  // voices = 'voices.bin';
  // dataDir = 'espeak-ng-data';

  // ============================================================
@@ -185,6 +203,10 @@ function initTts(context: Context): OfflineTts {
    acousticModelName = modelDir + '/' + acousticModelName;
  }

+  if (voices != '') {
+    voices = modelDir + '/' + voices;
+  }
+
  if (ruleFsts != '') {
    let fsts = ruleFsts.split(',')
    let tmp: string[] = [];
@@ -210,19 +232,24 @@ function initTts(context: Context): OfflineTts {
  if (dataDir != '') {
    copyRawFileDirToSandbox(context, modelDir + '/' + dataDir)
    let sandboxPath: string = context.getApplicationContext().filesDir;
-    dataDir = sandboxPath + '/' + modelDir + '/'  + dataDir;
+    dataDir = sandboxPath + '/' + modelDir + '/' + dataDir;
  }

  if (dictDir != '') {
    copyRawFileDirToSandbox(context, modelDir + '/' + dictDir)
    let sandboxPath: string = context.getApplicationContext().filesDir;
-    dictDir = sandboxPath + '/' + modelDir + '/'  + dictDir;
+    dictDir = sandboxPath + '/' + modelDir + '/' + dictDir;
  }

  const tokens = modelDir + '/tokens.txt';

  const config: OfflineTtsConfig = new OfflineTtsConfig();
-  config.model.vits.model = modelName;
+  if (voices != '') {
+    config.model.vits.model = '';
+  } else {
+    config.model.vits.model = modelName;
+  }
+
  config.model.vits.lexicon = lexicon;
  config.model.vits.tokens = tokens;
  config.model.vits.dataDir = dataDir;
@@ -235,6 +262,15 @@ function initTts(context: Context): OfflineTts {
  config.model.matcha.dataDir = dataDir;
  config.model.matcha.dictDir = dictDir;

+  if (voices != '') {
+    config.model.kokoro.model = modelName;
+  } else {
+    config.model.kokoro.model = '';
+  }
+  config.model.kokoro.voices = voices;
+  config.model.kokoro.tokens = tokens;
+  config.model.kokoro.dataDir = dataDir;
+
  config.model.numThreads = 2;
  config.model.debug = true;
  config.ruleFsts = ruleFsts;
@@ -250,14 +286,12 @@ interface TtsCallbackData {

 function callback(data: TtsCallbackData): number {
  workerPort.postMessage({
-    'msgType': 'tts-generate-partial',
-    samples: Float32Array.from(data.samples),
-    progress: data.progress,
+    'msgType': 'tts-generate-partial', samples: Float32Array.from(data.samples), progress: data.progress,
  });

  // 0 means to stop generating in C++
  // 1 means to continue generating in C++
-  return cancelled? 0 : 1;
+  return cancelled ? 0 : 1;
 }

 /**
@@ -272,9 +306,11 @@ workerPort.onmessage = (e: MessageEvents) => {
  if (msgType == 'init-tts' && !tts) {
    const context = e.data['context'] as Context;
    tts = initTts(context);
-    workerPort.postMessage({ 'msgType': 'init-tts-done',
+    workerPort.postMessage({
+      'msgType': 'init-tts-done',
      sampleRate: tts.sampleRate,
      numSpeakers: tts.numSpeakers,
+      numThreads: tts.config.model.numThreads,
    });
  }

@@ -297,16 +333,14 @@ workerPort.onmessage = (e: MessageEvents) => {
        console.log(`sampleRate: ${ttsOutput.sampleRate}`);

        workerPort.postMessage({
-          'msgType': 'tts-generate-done',
-          samples: Float32Array.from(ttsOutput.samples),
+          'msgType': 'tts-generate-done', samples: Float32Array.from(ttsOutput.samples),
        });

      });
    } else {
      const ttsOutput: TtsOutput = tts.generate(input);
      workerPort.postMessage({
-        'msgType': 'tts-generate-done',
-        samples: Float32Array.from(ttsOutput.samples),
+        'msgType': 'tts-generate-done', samples: Float32Array.from(ttsOutput.samples),
      });
    }