From 1f8e57513351feb7e5b0dc5fdbca9a9258ff359f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 28 Jul 2024 12:07:19 +0800 Subject: [PATCH] Add TTS example for Java API. (#1176) It plays the generated audio as it is still generating. --- .../NonStreamingTtsPiperEnWithCallback.java | 186 ++++++++++++++++++ java-api-examples/README.md | 7 + ...on-streaming-tts-piper-en-with-callback.sh | 40 ++++ sherpa-onnx/java-api/Makefile | 1 + .../src/com/k2fsa/sherpa/onnx/OfflineTts.java | 22 +++ .../k2fsa/sherpa/onnx/OfflineTtsCallback.java | 6 + 6 files changed, 262 insertions(+) create mode 100644 java-api-examples/NonStreamingTtsPiperEnWithCallback.java create mode 100755 java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java diff --git a/java-api-examples/NonStreamingTtsPiperEnWithCallback.java b/java-api-examples/NonStreamingTtsPiperEnWithCallback.java new file mode 100644 index 00000000..c4acae8c --- /dev/null +++ b/java-api-examples/NonStreamingTtsPiperEnWithCallback.java @@ -0,0 +1,186 @@ +// Copyright 2024 Xiaomi Corporation +// +// References +// https://www.baeldung.com/java-passing-method-parameter +// https://www.geeksforgeeks.org/how-to-create-a-thread-safe-queue-in-java/ +// https://stackoverflow.com/questions/74077394/java-audio-how-to-continuously-write-bytes-to-an-audio-file-as-they-are-being-g + +// This file shows how to use a piper VITS English TTS model +// to convert text to speech. You can pass a callback to the generation call, +// which is invoked whenever max_num_sentences sentences have +// finished generating. +// +// The callback saves the generated samples into a queue, from which they are played +// by a separate thread. 
+ +import com.k2fsa.sherpa.onnx.*; +import java.util.Queue; +import java.util.concurrent.*; +import java.util.concurrent.ConcurrentLinkedQueue; +import javax.sound.sampled.*; + +public class NonStreamingTtsPiperEn { + public static void main(String[] args) { + // please visit + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models + // to download model files + String model = "./vits-piper-en_GB-cori-medium/en_GB-cori-medium.onnx"; + String tokens = "./vits-piper-en_GB-cori-medium/tokens.txt"; + String dataDir = "./vits-piper-en_GB-cori-medium/espeak-ng-data"; + String text = + "Today as always, men fall into two groups: slaves and free men. Whoever does not have" + + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a" + + " businessman, an official, or a scholar."; + + OfflineTtsVitsModelConfig vitsModelConfig = + OfflineTtsVitsModelConfig.builder() + .setModel(model) + .setTokens(tokens) + .setDataDir(dataDir) + .build(); + + OfflineTtsModelConfig modelConfig = + OfflineTtsModelConfig.builder() + .setVits(vitsModelConfig) + .setNumThreads(1) + .setDebug(true) + .build(); + + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build(); + OfflineTts tts = new OfflineTts(config); + + Queue samplesQueue = new ConcurrentLinkedQueue<>(); + + Semaphore canPlaySem = new Semaphore(1); + try { + canPlaySem.acquire(); + } catch (InterruptedException ex) { + System.out.println("Failed to acquire the play semaphore in the main thread"); + return; + } + + Runnable playRuannable = + () -> { + try { + canPlaySem.acquire(); + } catch (InterruptedException e) { + System.out.println("Failed to get canPlay semaphore in the play thread"); + return; + } + + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html + AudioFormat format = + new AudioFormat( + tts.getSampleRate(), // sampleRate + 16, // sampleSizeInBits + 1, // channels + true, // signed + false // bigEndian + ); + DataLine.Info 
info = new DataLine.Info(SourceDataLine.class, format); + SourceDataLine line; + try { + line = (SourceDataLine) AudioSystem.getLine(info); + + int bufferSizeInBytes = tts.getSampleRate(); // 0.5 seconds + line.open(format, bufferSizeInBytes); + } catch (LineUnavailableException ex) { + System.out.println("Failed to open a device for playing"); + return; + } + line.start(); + + while (true) { + if (samplesQueue.isEmpty()) { + // Do nothing. + // + // If the generating speed is very slow, we can sleep + // for some time here to save some CPU. + } else { + byte[] samples = samplesQueue.poll(); + if (samples.length == 1) { + // end of the generating + break; + } + line.write(samples, 0, samples.length); + } + } + + line.drain(); + line.close(); + }; + + Thread playThread = new Thread(playRuannable); + playThread.start(); + + int sid = 0; + float speed = 1.0f; + long start = System.currentTimeMillis(); + GeneratedAudio audio = + tts.generateWithCallback( + text, + sid, + speed, + (float[] samples) -> { + + // we use a byte array to save int16 samples + byte[] samplesInt16 = new byte[samples.length * 2]; + for (int i = 0; i < samples.length; ++i) { + float s = samples[i]; + if (s > 1) { + s = 1; + } + + if (s < -1) { + s = -1; + } + + short t = (short) (s * 32767); + + // we use little endian + samplesInt16[2 * i] = (byte) (t & 0xff); + samplesInt16[2 * i + 1] = (byte) ((t & 0xff00) >> 8); + } + + samplesQueue.add(samplesInt16); + + canPlaySem.release(); + + // Note: You can play the samples. + // warning: You need to save a copy of samples since it is freed + // when this function returns + + // return 1 to continue generation + // return 0 to stop generation + return 1; + }); + + // Since a sample always has two bytes. We put a single byte + // into the queue to indicate that we have finished processing. 
+ samplesQueue.add(new byte[1]); + + long stop = System.currentTimeMillis(); + + float timeElapsedSeconds = (stop - start) / 1000.0f; + + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate(); + float real_time_factor = timeElapsedSeconds / audioDuration; + + try { + playThread.join(); + } catch (InterruptedException ex) { + System.out.println("Failed to join the play thread"); + return; + } + + String waveFilename = "tts-piper-en.wav"; + audio.save(waveFilename); + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); + System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); + System.out.printf("-- text: %s\n", text); + System.out.printf("-- Saved to %s\n", waveFilename); + + tts.release(); + } +} diff --git a/java-api-examples/README.md b/java-api-examples/README.md index 3a4f9acc..697f0c87 100755 --- a/java-api-examples/README.md +++ b/java-api-examples/README.md @@ -24,6 +24,7 @@ This directory contains examples for the JAVA API of sherpa-onnx. ./run-non-streaming-decode-file-nemo.sh ``` + ## Non-Streaming text-to-speech ```bash @@ -32,6 +33,12 @@ This directory contains examples for the JAVA API of sherpa-onnx. ./run-non-streaming-tts-vits-zh.sh ``` +## Non-Streaming text-to-speech (Play as it is generating) + +```bash +./run-non-streaming-tts-piper-en-with-callback.sh +``` + ## Spoken language identification ```bash diff --git a/java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh b/java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh new file mode 100755 index 00000000..e9678c54 --- /dev/null +++ b/java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! 
-f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +# please visit +# https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +# to download more models +if [ ! -f ./vits-piper-en_GB-cori-medium/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2 + tar xf vits-piper-en_GB-cori-medium.tar.bz2 + rm vits-piper-en_GB-cori-medium.tar.bz2 +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + NonStreamingTtsPiperEnWithCallback.java diff --git a/sherpa-onnx/java-api/Makefile b/sherpa-onnx/java-api/Makefile index 70d5fe20..69c3631b 100644 --- a/sherpa-onnx/java-api/Makefile +++ b/sherpa-onnx/java-api/Makefile @@ -38,6 +38,7 @@ java_files += OfflineTtsVitsModelConfig.java java_files += OfflineTtsModelConfig.java java_files += OfflineTtsConfig.java java_files += GeneratedAudio.java +java_files += OfflineTtsCallback.java java_files += OfflineTts.java java_files += SpokenLanguageIdentificationWhisperConfig.java diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTts.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTts.java index 7762692b..8c8b294a 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTts.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTts.java @@ -2,6 +2,7 @@ package com.k2fsa.sherpa.onnx; + public class OfflineTts { static { System.loadLibrary("sherpa-onnx-jni"); @@ -13,6 +14,10 @@ public class OfflineTts { ptr = newFromFile(config); } + public int 
getSampleRate() { + return getSampleRate(ptr); + } + public GeneratedAudio generate(String text) { return generate(text, 0, 1.0f); } @@ -28,6 +33,21 @@ public class OfflineTts { return new GeneratedAudio(samples, sampleRate); } + public GeneratedAudio generateWithCallback(String text, OfflineTtsCallback callback) { + return generateWithCallback(text, 0, 1.0f, callback); + } + + public GeneratedAudio generateWithCallback(String text, int sid, OfflineTtsCallback callback) { + return generateWithCallback(text, sid, 1.0f, callback); + } + + public GeneratedAudio generateWithCallback(String text, int sid, float speed, OfflineTtsCallback callback) { + Object[] arr = generateWithCallbackImpl(ptr, text, sid, speed, callback); + float[] samples = (float[]) arr[0]; + int sampleRate = (int) arr[1]; + return new GeneratedAudio(samples, sampleRate); + } + @Override protected void finalize() throws Throwable { release(); @@ -49,5 +69,7 @@ public class OfflineTts { private native Object[] generateImpl(long ptr, String text, int sid, float speed); + private native Object[] generateWithCallbackImpl(long ptr, String text, int sid, float speed, OfflineTtsCallback callback); + private native long newFromFile(OfflineTtsConfig config); } diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java new file mode 100644 index 00000000..396594a9 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java @@ -0,0 +1,6 @@ +package com.k2fsa.sherpa.onnx; + +@FunctionalInterface +public interface OfflineTtsCallback { + Integer invoke(float[] samples); +}