Add TTS example for Java API. (#1176)

It plays the generated audio while generation is still in progress.
2024-07-28 12:07:19 +08:00
parent 4e6aeff07e
commit 1f8e575133
6 changed files with 262 additions and 0 deletions
--- a/java-api-examples/NonStreamingTtsPiperEnWithCallback.java
+++ b/java-api-examples/NonStreamingTtsPiperEnWithCallback.java
@@ -0,0 +1,186 @@
+// Copyright 2024 Xiaomi Corporation
+//
+// References
+// https://www.baeldung.com/java-passing-method-parameter
+// https://www.geeksforgeeks.org/how-to-create-a-thread-safe-queue-in-java/
+// https://stackoverflow.com/questions/74077394/java-audio-how-to-continuously-write-bytes-to-an-audio-file-as-they-are-being-g
+
+// This file shows how to use a piper VITS English TTS model
+// to convert text to speech. You can pass a callback to the generation call,
+// which is invoked whenever max_num_sentences sentences have been
+// generated.
+//
+// The callback saves the generated samples into a queue, which are played
+// by a separate thread.
+
+import com.k2fsa.sherpa.onnx.*;
+import java.util.Queue;
+import java.util.concurrent.*;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import javax.sound.sampled.*;
+
+public class NonStreamingTtsPiperEn {
+  public static void main(String[] args) {
+    // please visit
+    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
+    // to download model files
+    String model = "./vits-piper-en_GB-cori-medium/en_GB-cori-medium.onnx";
+    String tokens = "./vits-piper-en_GB-cori-medium/tokens.txt";
+    String dataDir = "./vits-piper-en_GB-cori-medium/espeak-ng-data";
+    String text =
+        "Today as always, men fall into two groups: slaves and free men. Whoever does not have"
+            + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
+            + " businessman, an official, or a scholar.";
+
+    OfflineTtsVitsModelConfig vitsModelConfig =
+        OfflineTtsVitsModelConfig.builder()
+            .setModel(model)
+            .setTokens(tokens)
+            .setDataDir(dataDir)
+            .build();
+
+    OfflineTtsModelConfig modelConfig =
+        OfflineTtsModelConfig.builder()
+            .setVits(vitsModelConfig)
+            .setNumThreads(1)
+            .setDebug(true)
+            .build();
+
+    OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
+    OfflineTts tts = new OfflineTts(config);
+
+    Queue<byte[]> samplesQueue = new ConcurrentLinkedQueue<>();
+
+    Semaphore canPlaySem = new Semaphore(1);
+    try {
+      canPlaySem.acquire();
+    } catch (InterruptedException ex) {
+      System.out.println("Failed to acquire the play semaphore in the main thread");
+      return;
+    }
+
+    Runnable playRuannable =
+        () -> {
+          try {
+            canPlaySem.acquire();
+          } catch (InterruptedException e) {
+            System.out.println("Failed to get canPlay semaphore in the play thread");
+            return;
+          }
+
+          // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
+          AudioFormat format =
+              new AudioFormat(
+                  tts.getSampleRate(), // sampleRate
+                  16, // sampleSizeInBits
+                  1, // channels
+                  true, // signed
+                  false // bigEndian
+                  );
+          DataLine.Info info = new DataLine.Info(SourceDataLine.class, format);
+          SourceDataLine line;
+          try {
+            line = (SourceDataLine) AudioSystem.getLine(info);
+
+            int bufferSizeInBytes = tts.getSampleRate(); // 0.5 seconds
+            line.open(format, bufferSizeInBytes);
+          } catch (LineUnavailableException ex) {
+            System.out.println("Failed to open a device for playing");
+            return;
+          }
+          line.start();
+
+          while (true) {
+            if (samplesQueue.isEmpty()) {
+              // Do nothing.
+              //
+              // If the generating speed is very slow, we can sleep
+              // for some time here to save some CPU.
+            } else {
+              byte[] samples = samplesQueue.poll();
+              if (samples.length == 1) {
+                // end of the generating
+                break;
+              }
+              line.write(samples, 0, samples.length);
+            }
+          }
+
+          line.drain();
+          line.close();
+        };
+
+    Thread playThread = new Thread(playRuannable);
+    playThread.start();
+
+    int sid = 0;
+    float speed = 1.0f;
+    long start = System.currentTimeMillis();
+    GeneratedAudio audio =
+        tts.generateWithCallback(
+            text,
+            sid,
+            speed,
+            (float[] samples) -> {
+
+              // we use a byte array to save int16 samples
+              byte[] samplesInt16 = new byte[samples.length * 2];
+              for (int i = 0; i < samples.length; ++i) {
+                float s = samples[i];
+                if (s > 1) {
+                  s = 1;
+                }
+
+                if (s < -1) {
+                  s = -1;
+                }
+
+                short t = (short) (s * 32767);
+
+                // we use little endian
+                samplesInt16[2 * i] = (byte) (t & 0xff);
+                samplesInt16[2 * i + 1] = (byte) ((t & 0xff00) >> 8);
+              }
+
+              samplesQueue.add(samplesInt16);
+
+              canPlaySem.release();
+
+              // Note: You can play the samples.
+              // warning: You need to save a copy of samples since it is freed
+              // when this function returns
+
+              // return 1 to continue generation
+              // return 0 to stop generation
+              return 1;
+            });
+
+    // Since a sample always has two bytes. We put a single byte
+    // into the queue to indicate that we have finished processing.
+    samplesQueue.add(new byte[1]);
+
+    long stop = System.currentTimeMillis();
+
+    float timeElapsedSeconds = (stop - start) / 1000.0f;
+
+    float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
+    float real_time_factor = timeElapsedSeconds / audioDuration;
+
+    try {
+      playThread.join();
+    } catch (InterruptedException ex) {
+      System.out.println("Failed to join the play thread");
+      return;
+    }
+
+    String waveFilename = "tts-piper-en.wav";
+    audio.save(waveFilename);
+    System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
+    System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
+    System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
+    System.out.printf("-- text: %s\n", text);
+    System.out.printf("-- Saved to %s\n", waveFilename);
+
+    tts.release();
+  }
+}
--- a/java-api-examples/README.md
+++ b/java-api-examples/README.md
@@ -24,6 +24,7 @@ This directory contains examples for the JAVA API of sherpa-onnx.
 ./run-non-streaming-decode-file-nemo.sh
 ```

+
 ## Non-Streaming text-to-speech

 ```bash
@@ -32,6 +33,12 @@ This directory contains examples for the JAVA API of sherpa-onnx.
 ./run-non-streaming-tts-vits-zh.sh
 ```

+## Non-Streaming text-to-speech (play while generating)
+
+```bash
+./run-non-streaming-tts-piper-en-with-callback.sh
+```
+
 ## Spoken language identification

 ```bash
--- a/java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh
+++ b/java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Builds whatever is missing, fetches the model and runs the TTS callback example.
+set -ex
+
+# Build the JNI library unless a .dylib or .so build already exists.
+if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
+  mkdir -p ../build
+  pushd ../build
+  cmake \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    -DSHERPA_ONNX_ENABLE_JNI=ON \
+    ..
+  make -j4
+  ls -lh lib
+  popd
+fi
+
+# Build the Java API jar if it is missing.
+if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
+  pushd ../sherpa-onnx/java-api
+  make
+  popd
+fi
+
+# please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
+# to download more models
+if [ ! -f ./vits-piper-en_GB-cori-medium/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
+  tar xf vits-piper-en_GB-cori-medium.tar.bz2
+  rm vits-piper-en_GB-cori-medium.tar.bz2
+fi
+
+java \
+  -Djava.library.path="$PWD/../build/lib" \
+  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
+  NonStreamingTtsPiperEnWithCallback.java
--- a/sherpa-onnx/java-api/Makefile
+++ b/sherpa-onnx/java-api/Makefile
@@ -38,6 +38,7 @@ java_files += OfflineTtsVitsModelConfig.java
 java_files += OfflineTtsModelConfig.java
 java_files += OfflineTtsConfig.java
 java_files += GeneratedAudio.java
+java_files += OfflineTtsCallback.java
 java_files += OfflineTts.java

 java_files += SpokenLanguageIdentificationWhisperConfig.java
--- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTts.java
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTts.java
@@ -2,6 +2,7 @@

 package com.k2fsa.sherpa.onnx;

+
 public class OfflineTts {
    static {
        System.loadLibrary("sherpa-onnx-jni");
@@ -13,6 +14,10 @@ public class OfflineTts {
        ptr = newFromFile(config);
    }

+    public int getSampleRate() {
+        return this.getSampleRate(this.ptr); // forward the handle kept in ptr
+    }
+
    public GeneratedAudio generate(String text) {
        return generate(text, 0, 1.0f);
    }
@@ -28,6 +33,21 @@ public class OfflineTts {
        return new GeneratedAudio(samples, sampleRate);
    }

+    public GeneratedAudio generateWithCallback(String text, OfflineTtsCallback callback) {
+        return this.generateWithCallback(text, /* sid= */ 0, /* speed= */ 1.0f, callback);
+    }
+
+    public GeneratedAudio generateWithCallback(String text, int sid, OfflineTtsCallback callback) {
+        return this.generateWithCallback(text, sid, /* speed= */ 1.0f, callback);
+    }
+
+    public GeneratedAudio generateWithCallback(String text, int sid, float speed, OfflineTtsCallback callback) {
+        // The implementation hands back a two-element array:
+        // index 0 is cast to the samples, index 1 to the sample rate.
+        Object[] result = generateWithCallbackImpl(ptr, text, sid, speed, callback);
+        return new GeneratedAudio((float[]) result[0], (int) result[1]);
+    }
+
    @Override
    protected void finalize() throws Throwable {
        release();
@@ -49,5 +69,7 @@ public class OfflineTts {

    private native Object[] generateImpl(long ptr, String text, int sid, float speed);

+    private native Object[] generateWithCallbackImpl(long ptr, String text, int sid, float speed, OfflineTtsCallback callback);
+
    private native long newFromFile(OfflineTtsConfig config);
 }
--- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java
@@ -0,0 +1,6 @@
+package com.k2fsa.sherpa.onnx;
+
+@FunctionalInterface
+public interface OfflineTtsCallback { // callback type accepted by OfflineTts.generateWithCallback
+    Integer invoke(float[] samples); // NOTE(review): the piper example returns 1 to continue, 0 to stop - confirm
+}