Add TTS example for Java API. (#1176)
It plays the generated audio as it is still generating.
This commit is contained in:
186
java-api-examples/NonStreamingTtsPiperEnWithCallback.java
Normal file
186
java-api-examples/NonStreamingTtsPiperEnWithCallback.java
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
// Copyright 2024 Xiaomi Corporation
|
||||||
|
//
|
||||||
|
// References
|
||||||
|
// https://www.baeldung.com/java-passing-method-parameter
|
||||||
|
// https://www.geeksforgeeks.org/how-to-create-a-thread-safe-queue-in-java/
|
||||||
|
// https://stackoverflow.com/questions/74077394/java-audio-how-to-continuously-write-bytes-to-an-audio-file-as-they-are-being-g
|
||||||
|
|
||||||
|
// This file shows how to use a piper VITS English TTS model
|
||||||
|
// to convert text to speech. You can pass a callback to the generation call,
|
||||||
|
// which is invoked whenever max_num_sentences sentences have been
|
||||||
|
// generated.
|
||||||
|
//
|
||||||
|
// The callback saves the generated samples into a queue, which are played
|
||||||
|
// by a separate thread.
|
||||||
|
|
||||||
|
import com.k2fsa.sherpa.onnx.*;
|
||||||
|
import java.util.Queue;
|
||||||
|
import java.util.concurrent.*;
|
||||||
|
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||||
|
import javax.sound.sampled.*;
|
||||||
|
|
||||||
|
public class NonStreamingTtsPiperEn {
|
||||||
|
public static void main(String[] args) {
|
||||||
|
// please visit
|
||||||
|
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||||
|
// to download model files
|
||||||
|
String model = "./vits-piper-en_GB-cori-medium/en_GB-cori-medium.onnx";
|
||||||
|
String tokens = "./vits-piper-en_GB-cori-medium/tokens.txt";
|
||||||
|
String dataDir = "./vits-piper-en_GB-cori-medium/espeak-ng-data";
|
||||||
|
String text =
|
||||||
|
"Today as always, men fall into two groups: slaves and free men. Whoever does not have"
|
||||||
|
+ " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
|
||||||
|
+ " businessman, an official, or a scholar.";
|
||||||
|
|
||||||
|
OfflineTtsVitsModelConfig vitsModelConfig =
|
||||||
|
OfflineTtsVitsModelConfig.builder()
|
||||||
|
.setModel(model)
|
||||||
|
.setTokens(tokens)
|
||||||
|
.setDataDir(dataDir)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
OfflineTtsModelConfig modelConfig =
|
||||||
|
OfflineTtsModelConfig.builder()
|
||||||
|
.setVits(vitsModelConfig)
|
||||||
|
.setNumThreads(1)
|
||||||
|
.setDebug(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
|
||||||
|
OfflineTts tts = new OfflineTts(config);
|
||||||
|
|
||||||
|
Queue<byte[]> samplesQueue = new ConcurrentLinkedQueue<>();
|
||||||
|
|
||||||
|
Semaphore canPlaySem = new Semaphore(1);
|
||||||
|
try {
|
||||||
|
canPlaySem.acquire();
|
||||||
|
} catch (InterruptedException ex) {
|
||||||
|
System.out.println("Failed to acquire the play semaphore in the main thread");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Runnable playRuannable =
|
||||||
|
() -> {
|
||||||
|
try {
|
||||||
|
canPlaySem.acquire();
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
System.out.println("Failed to get canPlay semaphore in the play thread");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
|
||||||
|
AudioFormat format =
|
||||||
|
new AudioFormat(
|
||||||
|
tts.getSampleRate(), // sampleRate
|
||||||
|
16, // sampleSizeInBits
|
||||||
|
1, // channels
|
||||||
|
true, // signed
|
||||||
|
false // bigEndian
|
||||||
|
);
|
||||||
|
DataLine.Info info = new DataLine.Info(SourceDataLine.class, format);
|
||||||
|
SourceDataLine line;
|
||||||
|
try {
|
||||||
|
line = (SourceDataLine) AudioSystem.getLine(info);
|
||||||
|
|
||||||
|
int bufferSizeInBytes = tts.getSampleRate(); // 0.5 seconds
|
||||||
|
line.open(format, bufferSizeInBytes);
|
||||||
|
} catch (LineUnavailableException ex) {
|
||||||
|
System.out.println("Failed to open a device for playing");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
line.start();
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
if (samplesQueue.isEmpty()) {
|
||||||
|
// Do nothing.
|
||||||
|
//
|
||||||
|
// If the generating speed is very slow, we can sleep
|
||||||
|
// for some time here to save some CPU.
|
||||||
|
} else {
|
||||||
|
byte[] samples = samplesQueue.poll();
|
||||||
|
if (samples.length == 1) {
|
||||||
|
// end of the generating
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
line.write(samples, 0, samples.length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
line.drain();
|
||||||
|
line.close();
|
||||||
|
};
|
||||||
|
|
||||||
|
Thread playThread = new Thread(playRuannable);
|
||||||
|
playThread.start();
|
||||||
|
|
||||||
|
int sid = 0;
|
||||||
|
float speed = 1.0f;
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
GeneratedAudio audio =
|
||||||
|
tts.generateWithCallback(
|
||||||
|
text,
|
||||||
|
sid,
|
||||||
|
speed,
|
||||||
|
(float[] samples) -> {
|
||||||
|
|
||||||
|
// we use a byte array to save int16 samples
|
||||||
|
byte[] samplesInt16 = new byte[samples.length * 2];
|
||||||
|
for (int i = 0; i < samples.length; ++i) {
|
||||||
|
float s = samples[i];
|
||||||
|
if (s > 1) {
|
||||||
|
s = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s < -1) {
|
||||||
|
s = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
short t = (short) (s * 32767);
|
||||||
|
|
||||||
|
// we use little endian
|
||||||
|
samplesInt16[2 * i] = (byte) (t & 0xff);
|
||||||
|
samplesInt16[2 * i + 1] = (byte) ((t & 0xff00) >> 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
samplesQueue.add(samplesInt16);
|
||||||
|
|
||||||
|
canPlaySem.release();
|
||||||
|
|
||||||
|
// Note: You can play the samples.
|
||||||
|
// warning: You need to save a copy of samples since it is freed
|
||||||
|
// when this function returns
|
||||||
|
|
||||||
|
// return 1 to continue generation
|
||||||
|
// return 0 to stop generation
|
||||||
|
return 1;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Since a sample always has two bytes. We put a single byte
|
||||||
|
// into the queue to indicate that we have finished processing.
|
||||||
|
samplesQueue.add(new byte[1]);
|
||||||
|
|
||||||
|
long stop = System.currentTimeMillis();
|
||||||
|
|
||||||
|
float timeElapsedSeconds = (stop - start) / 1000.0f;
|
||||||
|
|
||||||
|
float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
|
||||||
|
float real_time_factor = timeElapsedSeconds / audioDuration;
|
||||||
|
|
||||||
|
try {
|
||||||
|
playThread.join();
|
||||||
|
} catch (InterruptedException ex) {
|
||||||
|
System.out.println("Failed to join the play thread");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String waveFilename = "tts-piper-en.wav";
|
||||||
|
audio.save(waveFilename);
|
||||||
|
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
|
||||||
|
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
|
||||||
|
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
|
||||||
|
System.out.printf("-- text: %s\n", text);
|
||||||
|
System.out.printf("-- Saved to %s\n", waveFilename);
|
||||||
|
|
||||||
|
tts.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -24,6 +24,7 @@ This directory contains examples for the JAVA API of sherpa-onnx.
|
|||||||
./run-non-streaming-decode-file-nemo.sh
|
./run-non-streaming-decode-file-nemo.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
## Non-Streaming text-to-speech
|
## Non-Streaming text-to-speech
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -32,6 +33,12 @@ This directory contains examples for the JAVA API of sherpa-onnx.
|
|||||||
./run-non-streaming-tts-vits-zh.sh
|
./run-non-streaming-tts-vits-zh.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Non-Streaming text-to-speech (Play as it is generating)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./run-non-streaming-tts-piper-en-with-callback.sh
|
||||||
|
```
|
||||||
|
|
||||||
## Spoken language identification
|
## Spoken language identification
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
40
java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh
Executable file
40
java-api-examples/run-non-streaming-tts-piper-en-with-callback.sh
Executable file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the JNI library and the Java API jar if they are missing, downloads
# the piper English model if it is missing, and then runs the example that
# plays the audio while it is being generated.

set -ex

# Build the native JNI library unless a macOS (.dylib) or Linux (.so) build
# already exists.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if it has not been built yet.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# please visit
# https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
# to download more models
if [ ! -f ./vits-piper-en_GB-cori-medium/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
  tar xf vits-piper-en_GB-cori-medium.tar.bz2
  rm vits-piper-en_GB-cori-medium.tar.bz2
fi

# $PWD is quoted so that a checkout path containing spaces is passed to the
# JVM as a single argument.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsPiperEnWithCallback.java
|
||||||
@@ -38,6 +38,7 @@ java_files += OfflineTtsVitsModelConfig.java
|
|||||||
java_files += OfflineTtsModelConfig.java
|
java_files += OfflineTtsModelConfig.java
|
||||||
java_files += OfflineTtsConfig.java
|
java_files += OfflineTtsConfig.java
|
||||||
java_files += GeneratedAudio.java
|
java_files += GeneratedAudio.java
|
||||||
|
java_files += OfflineTtsCallback.java
|
||||||
java_files += OfflineTts.java
|
java_files += OfflineTts.java
|
||||||
|
|
||||||
java_files += SpokenLanguageIdentificationWhisperConfig.java
|
java_files += SpokenLanguageIdentificationWhisperConfig.java
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
package com.k2fsa.sherpa.onnx;
|
package com.k2fsa.sherpa.onnx;
|
||||||
|
|
||||||
|
|
||||||
public class OfflineTts {
|
public class OfflineTts {
|
||||||
static {
|
static {
|
||||||
System.loadLibrary("sherpa-onnx-jni");
|
System.loadLibrary("sherpa-onnx-jni");
|
||||||
@@ -13,6 +14,10 @@ public class OfflineTts {
|
|||||||
ptr = newFromFile(config);
|
ptr = newFromFile(config);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int getSampleRate() {
|
||||||
|
return getSampleRate(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
public GeneratedAudio generate(String text) {
|
public GeneratedAudio generate(String text) {
|
||||||
return generate(text, 0, 1.0f);
|
return generate(text, 0, 1.0f);
|
||||||
}
|
}
|
||||||
@@ -28,6 +33,21 @@ public class OfflineTts {
|
|||||||
return new GeneratedAudio(samples, sampleRate);
|
return new GeneratedAudio(samples, sampleRate);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public GeneratedAudio generateWithCallback(String text, OfflineTtsCallback callback) {
|
||||||
|
return generateWithCallback(text, 0, 1.0f, callback);
|
||||||
|
}
|
||||||
|
|
||||||
|
public GeneratedAudio generateWithCallback(String text, int sid, OfflineTtsCallback callback) {
|
||||||
|
return generateWithCallback(text, sid, 1.0f, callback);
|
||||||
|
}
|
||||||
|
|
||||||
|
public GeneratedAudio generateWithCallback(String text, int sid, float speed, OfflineTtsCallback callback) {
|
||||||
|
Object[] arr = generateWithCallbackImpl(ptr, text, sid, speed, callback);
|
||||||
|
float[] samples = (float[]) arr[0];
|
||||||
|
int sampleRate = (int) arr[1];
|
||||||
|
return new GeneratedAudio(samples, sampleRate);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void finalize() throws Throwable {
|
protected void finalize() throws Throwable {
|
||||||
release();
|
release();
|
||||||
@@ -49,5 +69,7 @@ public class OfflineTts {
|
|||||||
|
|
||||||
private native Object[] generateImpl(long ptr, String text, int sid, float speed);
|
private native Object[] generateImpl(long ptr, String text, int sid, float speed);
|
||||||
|
|
||||||
|
// Native counterpart of generateWithCallback(). The caller reads element 0 of the
// returned array as float[] (the samples) and element 1 as int (the sample rate).
private native Object[] generateWithCallbackImpl(long ptr, String text, int sid, float speed, OfflineTtsCallback callback);
|
||||||
|
|
||||||
private native long newFromFile(OfflineTtsConfig config);
|
private native long newFromFile(OfflineTtsConfig config);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
package com.k2fsa.sherpa.onnx;
|
||||||
|
|
||||||
|
/**
 * Callback accepted by {@code OfflineTts.generateWithCallback}.
 *
 * <p>According to the accompanying example, the callback receives newly generated samples
 * while generation is still in progress.
 */
@FunctionalInterface
public interface OfflineTtsCallback {
  /**
   * Receives a batch of generated samples.
   *
   * @param samples the generated samples; per the example's notes the array should be copied
   *     if it is needed after this method returns
   * @return per the example's notes, 1 to continue generation and 0 to stop it
   */
  Integer invoke(float[] samples);
}
|
||||||
Reference in New Issue
Block a user