Add TTS example for Java API. (#1176)

It plays the generated audio while generation is still in progress.
2024-07-28 12:07:19 +08:00
parent 4e6aeff07e
commit 1f8e575133
6 changed files with 262 additions and 0 deletions
--- a/java-api-examples/NonStreamingTtsPiperEnWithCallback.java
+++ b/java-api-examples/NonStreamingTtsPiperEnWithCallback.java
@@ -0,0 +1,186 @@
 // Copyright 2024 Xiaomi Corporation
 //
 // References
 // https://www.baeldung.com/java-passing-method-parameter
 // https://www.geeksforgeeks.org/how-to-create-a-thread-safe-queue-in-java/
 // https://stackoverflow.com/questions/74077394/java-audio-how-to-continuously-write-bytes-to-an-audio-file-as-they-are-being-g
 // This file shows how to use a piper VITS English TTS model
 // to convert text to speech. You can pass a callback to the generation call,
 // which is invoked each time max_num_sentences sentences have finished
 // generating.
 //
 // The callback saves the generated samples into a queue, which are played
 // by a separate thread.
 import com.k2fsa.sherpa.onnx.*;
 import java.util.Queue;
 import java.util.concurrent.*;
 import java.util.concurrent.ConcurrentLinkedQueue;
 import javax.sound.sampled.*;
 /**
  * Text-to-speech example that starts playing audio while the rest is still being generated.
  *
  * <p>The generation callback converts every chunk of float samples to 16-bit PCM and hands it
  * to a playback thread through a blocking queue. A one-byte array marks the end of the stream,
  * since a real chunk always holds a whole number of two-byte samples.
  */
 public class NonStreamingTtsPiperEn {
  /** End-of-stream marker for the playback queue; real chunks always have an even length. */
  private static final byte[] END_OF_STREAM = new byte[1];

  /**
   * Configures the piper VITS model, then synthesizes and plays the example text.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    // please visit
    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
    // to download model files
    String model = "./vits-piper-en_GB-cori-medium/en_GB-cori-medium.onnx";
    String tokens = "./vits-piper-en_GB-cori-medium/tokens.txt";
    String dataDir = "./vits-piper-en_GB-cori-medium/espeak-ng-data";
    String text =
        "Today as always, men fall into two groups: slaves and free men. Whoever does not have"
            + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
            + " businessman, an official, or a scholar.";

    OfflineTtsVitsModelConfig vitsModelConfig =
        OfflineTtsVitsModelConfig.builder()
            .setModel(model)
            .setTokens(tokens)
            .setDataDir(dataDir)
            .build();
    OfflineTtsModelConfig modelConfig =
        OfflineTtsModelConfig.builder()
            .setVits(vitsModelConfig)
            .setNumThreads(1)
            .setDebug(true)
            .build();
    OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();

    OfflineTts tts = new OfflineTts(config);
    try {
      synthesizeAndPlay(tts, text);
    } finally {
      // Release the native handle on every exit path, including early returns.
      tts.release();
    }
  }

  /**
   * Generates speech for {@code text}, plays it as chunks arrive, saves the full audio to a wave
   * file and prints timing statistics.
   *
   * @param tts the engine to synthesize with; not released here
   * @param text the text to convert to speech
   */
  private static void synthesizeAndPlay(OfflineTts tts, String text) {
    BlockingQueue<byte[]> pcmChunks = new LinkedBlockingQueue<>();
    // Read the sample rate up front so the play thread never touches the engine
    // while generation is running.
    int sampleRate = tts.getSampleRate();
    Thread playThread = new Thread(() -> playUntilEndOfStream(pcmChunks, sampleRate));
    playThread.start();

    int sid = 0;
    float speed = 1.0f;
    long start = System.currentTimeMillis();
    GeneratedAudio audio;
    try {
      audio =
          tts.generateWithCallback(
              text,
              sid,
              speed,
              (float[] samples) -> {
                // warning: samples is freed when this function returns, so we
                // queue a converted copy instead of the array itself.
                pcmChunks.add(toPcm16LittleEndian(samples));
                // return 1 to continue generation
                // return 0 to stop generation
                return 1;
              });
    } finally {
      // Always signal end-of-stream; otherwise the play thread would wait forever
      // if generation fails or never invokes the callback.
      pcmChunks.add(END_OF_STREAM);
    }
    long stop = System.currentTimeMillis();

    float timeElapsedSeconds = (stop - start) / 1000.0f;
    float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
    float realTimeFactor = timeElapsedSeconds / audioDuration;

    try {
      playThread.join();
    } catch (InterruptedException ex) {
      Thread.currentThread().interrupt();
      System.out.println("Failed to join the play thread");
      return;
    }

    String waveFilename = "tts-piper-en.wav";
    audio.save(waveFilename);
    System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
    System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
    System.out.printf("-- real-time factor (RTF): %.3f\n", realTimeFactor);
    System.out.printf("-- text: %s\n", text);
    System.out.printf("-- Saved to %s\n", waveFilename);
  }

  /**
   * Plays queued PCM chunks on the default output device until the end-of-stream marker arrives.
   *
   * <p>The device is opened only after the first chunk is available, and waiting for chunks
   * blocks instead of spinning.
   *
   * @param pcmChunks queue of 16-bit signed little-endian mono chunks, ended by a 1-byte array
   * @param sampleRate sample rate of the audio in Hz
   */
  private static void playUntilEndOfStream(BlockingQueue<byte[]> pcmChunks, int sampleRate) {
    byte[] chunk;
    try {
      chunk = pcmChunks.take();
    } catch (InterruptedException ex) {
      Thread.currentThread().interrupt();
      System.out.println("Interrupted while waiting for audio to play");
      return;
    }
    if (isEndOfStream(chunk)) {
      // Nothing was generated, so there is no reason to open a device.
      return;
    }

    // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
    AudioFormat format =
        new AudioFormat(
            sampleRate, // sampleRate
            16, // sampleSizeInBits
            1, // channels
            true, // signed
            false // bigEndian
            );
    DataLine.Info info = new DataLine.Info(SourceDataLine.class, format);
    SourceDataLine line;
    try {
      line = (SourceDataLine) AudioSystem.getLine(info);
      // 16-bit mono uses 2 bytes per sample, so sampleRate bytes hold 0.5 seconds.
      line.open(format, sampleRate);
    } catch (LineUnavailableException ex) {
      System.out.println("Failed to open a device for playing");
      return;
    }

    line.start();
    try {
      while (!isEndOfStream(chunk)) {
        line.write(chunk, 0, chunk.length);
        chunk = pcmChunks.take();
      }
      // Let the device finish playing what is still buffered.
      line.drain();
    } catch (InterruptedException ex) {
      Thread.currentThread().interrupt();
      System.out.println("Interrupted while waiting for audio to play");
    } finally {
      line.close();
    }
  }

  /** Returns true if {@code chunk} is the one-byte end-of-stream marker. */
  private static boolean isEndOfStream(byte[] chunk) {
    return chunk.length == 1;
  }

  /**
   * Converts float samples to 16-bit signed little-endian PCM, clamping each sample to [-1, 1].
   *
   * @param samples float samples; not modified
   * @return a new array with two bytes per input sample
   */
  private static byte[] toPcm16LittleEndian(float[] samples) {
    byte[] pcm = new byte[samples.length * 2];
    for (int i = 0; i < samples.length; ++i) {
      float clamped = Math.max(-1.0f, Math.min(1.0f, samples[i]));
      short value = (short) (clamped * 32767);
      pcm[2 * i] = (byte) (value & 0xff);
      pcm[2 * i + 1] = (byte) ((value >> 8) & 0xff);
    }
    return pcm;
  }
 }
--- a/java-api-examples/README.md
+++ b/java-api-examples/README.md
@@ -24,6 +24,7 @@ This directory contains examples for the JAVA API of sherpa-onnx.
 ./run-non-streaming-decode-file-nemo.sh
 ```
 ## Non-Streaming text-to-speech
 ```bash
@@ -32,6 +33,12 @@ This directory contains examples for the JAVA API of sherpa-onnx.
 ./run-non-streaming-tts-vits-zh.sh
 ```
 ## Non-Streaming text-to-speech (Play as it is generating)
 ```bash
 ./run-non-streaming-tts-piper-en-with-callback.sh
 ```
 ## Spoken language identification
 ```bash
--- a/java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh
+++ b/java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh
@@ -0,0 +1,40 @@
 #!/usr/bin/env bash
 #
 # Builds the sherpa-onnx JNI library and the Java API jar if they are missing,
 # downloads the piper en_GB-cori-medium model if needed, and then runs the
 # play-while-generating TTS example.
 set -ex

 # Build the JNI shared library unless a macOS (.dylib) or Linux (.so) build exists.
 if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib  && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..
  make -j4
  ls -lh lib
  popd
 fi

 # Build the Java API jar if it is missing.
 if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
 fi

 # please visit
 # https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
 # to download more models
 if [ ! -f ./vits-piper-en_GB-cori-medium/tokens.txt ]; then
  # -f makes curl fail on an HTTP error instead of saving the error page as the archive.
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
  tar xf vits-piper-en_GB-cori-medium.tar.bz2
  rm vits-piper-en_GB-cori-medium.tar.bz2
 fi

 # Quote the library path so a working directory containing spaces still works.
 java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsPiperEnWithCallback.java
--- a/sherpa-onnx/java-api/Makefile
+++ b/sherpa-onnx/java-api/Makefile
@@ -38,6 +38,7 @@ java_files += OfflineTtsVitsModelConfig.java
 java_files += OfflineTtsModelConfig.java
 java_files += OfflineTtsConfig.java
 java_files += GeneratedAudio.java
 java_files += OfflineTtsCallback.java
 java_files += OfflineTts.java
 java_files += SpokenLanguageIdentificationWhisperConfig.java
--- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTts.java
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTts.java
@@ -2,6 +2,7 @@
 package com.k2fsa.sherpa.onnx;
 public class OfflineTts {
    static {
        System.loadLibrary("sherpa-onnx-jni");
@@ -13,6 +14,10 @@ public class OfflineTts {
        ptr = newFromFile(config);
    }
    /**
     * Returns the sample rate reported by the native engine for this instance's handle.
     *
     * @return the value of the native {@code getSampleRate} call on {@code ptr}
     */
    public int getSampleRate() {
        final int sampleRate = getSampleRate(ptr);
        return sampleRate;
    }
    /**
     * Generates audio for {@code text} using speaker id 0 and speed 1.0.
     *
     * @param text the text to synthesize
     * @return the generated audio
     */
    public GeneratedAudio generate(String text) {
        final int defaultSid = 0;
        final float defaultSpeed = 1.0f;
        return generate(text, defaultSid, defaultSpeed);
    }
@@ -28,6 +33,21 @@ public class OfflineTts {
        return new GeneratedAudio(samples, sampleRate);
    }
    /**
     * Generates audio for {@code text} using speaker id 0 and speed 1.0, passing
     * {@code callback} through to the four-argument overload.
     *
     * @param text the text to synthesize
     * @param callback the callback forwarded to the generation call
     * @return the generated audio
     */
    public GeneratedAudio generateWithCallback(String text, OfflineTtsCallback callback) {
        final int defaultSid = 0;
        final float defaultSpeed = 1.0f;
        return generateWithCallback(text, defaultSid, defaultSpeed, callback);
    }
    /**
     * Generates audio for {@code text} with the given speaker id at speed 1.0, passing
     * {@code callback} through to the four-argument overload.
     *
     * @param text the text to synthesize
     * @param sid the speaker id
     * @param callback the callback forwarded to the generation call
     * @return the generated audio
     */
    public GeneratedAudio generateWithCallback(String text, int sid, OfflineTtsCallback callback) {
        final float defaultSpeed = 1.0f;
        return generateWithCallback(text, sid, defaultSpeed, callback);
    }
    /**
     * Generates audio for {@code text} through the native implementation, handing it
     * {@code callback}.
     *
     * @param text the text to synthesize
     * @param sid the speaker id
     * @param speed the speech speed
     * @param callback the callback forwarded to the native call
     * @return the generated samples and their sample rate
     */
    public GeneratedAudio generateWithCallback(String text, int sid, float speed, OfflineTtsCallback callback) {
        // The native call returns a two-element array: [0] the samples, [1] the sample rate.
        final Object[] nativeResult = generateWithCallbackImpl(ptr, text, sid, speed, callback);
        return new GeneratedAudio((float[]) nativeResult[0], (int) nativeResult[1]);
    }
    @Override
    protected void finalize() throws Throwable {
        release();
@@ -49,5 +69,7 @@ public class OfflineTts {
    // Native synthesis entry point without a callback.
    // NOTE(review): presumably returns the same {float[] samples, int sampleRate}
    // pair as generateWithCallbackImpl - confirm against the JNI code.
    private native Object[] generateImpl(long ptr, String text, int sid, float speed);
    // Native synthesis entry point that also receives a callback. The caller reads
    // element 0 of the result as float[] samples and element 1 as the int sample rate.
    private native Object[] generateWithCallbackImpl(long ptr, String text, int sid, float speed, OfflineTtsCallback callback);
    // Creates the native object from the config; the constructor stores the result in ptr.
    private native long newFromFile(OfflineTtsConfig config);
 }
--- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java
@@ -0,0 +1,6 @@
 package com.k2fsa.sherpa.onnx;
/**
 * Callback that can be passed to {@code OfflineTts.generateWithCallback}.
 *
 * <p>It has a single abstract method, so a lambda can be used as the callback.
 */
@FunctionalInterface
 public interface OfflineTtsCallback {
    /**
     * Receives a chunk of generated samples.
     *
     * <p>NOTE(review): the usage notes in the Java example say to return 1 to continue
     * generation and 0 to stop it, and to copy {@code samples} if it is needed after this
     * method returns - confirm against the JNI implementation.
     *
     * @param samples the generated samples
     * @return a status value read by the caller of this callback
     */
    Integer invoke(float[] samples);
 }