diff --git a/.github/workflows/run-java-test.yaml b/.github/workflows/run-java-test.yaml index 487cd8c7..93117d03 100644 --- a/.github/workflows/run-java-test.yaml +++ b/.github/workflows/run-java-test.yaml @@ -100,12 +100,32 @@ jobs: -DBUILD_SHARED_LIBS=ON \ -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DBUILD_ESPEAK_NG_EXE=OFF \ -DSHERPA_ONNX_ENABLE_JNI=ON \ .. make -j4 ls -lh lib + - name: Run java test (VAD + Non-streaming Paraformer) + shell: bash + run: | + cd ./java-api-examples + ./run-vad-non-streaming-paraformer.sh + rm *.onnx + ls -lh *.wav + rm *.wav + rm -rf sherpa-onnx-* + + - name: Run java test (VAD remove silence) + shell: bash + run: | + cd ./java-api-examples + ./run-vad-remove-slience.sh + rm *.onnx + ls -lh *.wav + rm *.wav + - name: Run java test (speaker identification) shell: bash run: | diff --git a/java-api-examples/README.md b/java-api-examples/README.md index e775994f..96973e15 100755 --- a/java-api-examples/README.md +++ b/java-api-examples/README.md @@ -56,3 +56,15 @@ The punctuation model supports both English and Chinese. ```bash ./run-speaker-identification.sh ``` + +## VAD (Remove silence) + +```bash +./run-vad-remove-slience.sh +``` + +## VAD + Non-streaming Paraformer for speech recognition + +```bash +./run-vad-non-streaming-paraformer.sh +``` diff --git a/java-api-examples/VadNonStreamingParaformer.java b/java-api-examples/VadNonStreamingParaformer.java new file mode 100644 index 00000000..be54d2d2 --- /dev/null +++ b/java-api-examples/VadNonStreamingParaformer.java @@ -0,0 +1,104 @@ +// Copyright 2024 Xiaomi Corporation + +// This file shows how to use a silero_vad model with a non-streaming Paraformer +// for speech recognition. 
+
+import com.k2fsa.sherpa.onnx.*;
+import java.util.Arrays;
+
+public class VadNonStreamingParaformer {
+  // Samples per VAD window; also passed to setWindowSize() in createVad().
+  private static final int WINDOW_SIZE = 512;
+  // Sample rate in Hz given to the VAD config and to each recognizer stream.
+  private static final int SAMPLE_RATE = 16000;
+
+  /** Builds the detector from ./silero_vad.onnx with this example's settings. */
+  public static Vad createVad() {
+    // please download ./silero_vad.onnx from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+    SileroVadModelConfig sileroVad =
+        SileroVadModelConfig.builder()
+            .setModel("./silero_vad.onnx")
+            .setThreshold(0.5f)
+            .setMinSilenceDuration(0.25f)
+            .setMinSpeechDuration(0.5f)
+            .setWindowSize(WINDOW_SIZE)
+            .build();
+
+    VadModelConfig config =
+        VadModelConfig.builder()
+            .setSileroVadModelConfig(sileroVad)
+            .setSampleRate(SAMPLE_RATE)
+            .setNumThreads(1)
+            .setDebug(true)
+            .setProvider("cpu")
+            .build();
+
+    return new Vad(config);
+  }
+
+  /** Builds a non-streaming Paraformer recognizer that decodes with greedy search. */
+  public static OfflineRecognizer createOfflineRecognizer() {
+    // please refer to
+    // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english
+    // to download model files
+    String model = "./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx";
+    String tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt";
+
+    // No wave file is opened here; the recognizer is built from the model files alone.
+    OfflineParaformerModelConfig paraformer =
+        OfflineParaformerModelConfig.builder().setModel(model).build();
+
+    OfflineModelConfig modelConfig =
+        OfflineModelConfig.builder()
+            .setParaformer(paraformer)
+            .setTokens(tokens)
+            .setNumThreads(1)
+            .setDebug(true)
+            .build();
+
+    OfflineRecognizerConfig config =
+        OfflineRecognizerConfig.builder()
+            .setOfflineModelConfig(modelConfig)
+            .setDecodingMethod("greedy_search")
+            .build();
+
+    return new OfflineRecognizer(config);
+  }
+
+  /** Decodes every segment queued in {@code vad}, prints non-empty text, and pops it. */
+  private static void decodeQueuedSegments(Vad vad, OfflineRecognizer recognizer) {
+    while (!vad.empty()) {
+      SpeechSegment segment = vad.front();
+      float startTime = segment.getStart() / (float) SAMPLE_RATE;
+      float duration = segment.getSamples().length / (float) SAMPLE_RATE;
+      OfflineStream stream = recognizer.createStream();
+      stream.acceptWaveform(segment.getSamples(), SAMPLE_RATE);
+      recognizer.decode(stream);
+      String text = recognizer.getResult(stream).getText();
+      if (!text.isEmpty()) {
+        System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
+      }
+      vad.pop();
+    }
+  }
+
+  /** Runs the VAD over ./lei-jun-test.wav and prints the text of each speech segment. */
+  public static void main(String[] args) {
+    Vad vad = createVad();
+    OfflineRecognizer recognizer = createOfflineRecognizer();
+
+    // You can download the test file from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+    float[] all = new WaveReader("./lei-jun-test.wav").getSamples();
+    // Only whole windows are fed; a trailing partial window is dropped.
+    for (int start = 0; start + WINDOW_SIZE <= all.length; start += WINDOW_SIZE) {
+      vad.acceptWaveform(Arrays.copyOfRange(all, start, start + WINDOW_SIZE));
+      if (vad.isSpeechDetected()) {
+        decodeQueuedSegments(vad, recognizer);
+      }
+    }
+    // Segments still queued after the last window would otherwise never be printed.
+    decodeQueuedSegments(vad, recognizer);
+  }
+}
diff --git a/java-api-examples/VadRemoveSilence.java b/java-api-examples/VadRemoveSilence.java
new file mode 100644
index 00000000..2d5e48d9
--- /dev/null
+++ b/java-api-examples/VadRemoveSilence.java
@@ -0,0 +1,79 @@
+// Copyright 2024 Xiaomi Corporation
+
+// This file shows how to use a silero_vad model to remove silences from
+// a wave file.
+
+import com.k2fsa.sherpa.onnx.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class VadRemoveSilence {
+  /** Appends the samples of every segment queued in {@code vad} to {@code segments}. */
+  private static void collectSegments(Vad vad, List<float[]> segments) {
+    while (!vad.empty()) {
+      // if you want to get the starting time of this segment, you can use
+      /* float startTime = vad.front().getStart() / 16000.0f; */
+      segments.add(vad.front().getSamples());
+      vad.pop();
+    }
+  }
+
+  public static void main(String[] args) {
+    // please download ./silero_vad.onnx from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+    SileroVadModelConfig sileroVad =
+        SileroVadModelConfig.builder()
+            .setModel("./silero_vad.onnx")
+            .setThreshold(0.5f)
+            .setMinSilenceDuration(0.25f)
+            .setMinSpeechDuration(0.5f)
+            .setWindowSize(512)
+            .build();
+
+    VadModelConfig config =
+        VadModelConfig.builder()
+            .setSileroVadModelConfig(sileroVad)
+            .setSampleRate(16000)
+            .setNumThreads(1)
+            .setDebug(true)
+            .setProvider("cpu")
+            .build();
+
+    Vad vad = new Vad(config);
+
+    // You can download the test file from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+    float[] all = new WaveReader("./lei-jun-test.wav").getSamples();
+    // Typed list: iterating a raw ArrayList yields Object, which is not assignable to float[].
+    List<float[]> segments = new ArrayList<>();
+    // Feed whole 512-sample windows (the configured window size); a trailing partial one is dropped.
+    for (int start = 0; start + 512 <= all.length; start += 512) {
+      vad.acceptWaveform(Arrays.copyOfRange(all, start, start + 512));
+      if (vad.isSpeechDetected()) {
+        collectSegments(vad, segments);
+      }
+    }
+    // Segments still queued after the last window would otherwise be left out of the output.
+    collectSegments(vad, segments);
+
+    // Concatenate the kept segments into one contiguous buffer.
+    int total = 0;
+    for (float[] s : segments) {
+      total += s.length;
+    }
+    float[] allSamples = new float[total];
+    int offset = 0;
+    for (float[] s : segments) {
+      System.arraycopy(s, 0, allSamples, offset, s.length);
+      offset += s.length;
+    }
+
+    String outFilename = "lei-jun-test-no-silence.wav";
+    if (!WaveWriter.write(outFilename, allSamples, 16000)) {
+      System.err.printf("Failed to write %s\n", outFilename);
+      System.exit(1);
+    }
+    System.out.printf("Saved to %s\n", outFilename);
+  }
+}
diff --git 
a/java-api-examples/run-vad-non-streaming-paraformer.sh b/java-api-examples/run-vad-non-streaming-paraformer.sh
new file mode 100755
index 00000000..b3a04f0b
--- /dev/null
+++ b/java-api-examples/run-vad-non-streaming-paraformer.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+#
+# Builds the JNI library and the Java API jar when they are missing, downloads
+# the silero-vad model, a test wave file and the Paraformer model, and then
+# runs ./VadNonStreamingParaformer.java.
+
+set -ex
+
+# Build the JNI library only when neither the .dylib nor the .so is present.
+if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
+  mkdir -p ../build
+  pushd ../build
+  cmake \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    -DSHERPA_ONNX_ENABLE_JNI=ON \
+    ..
+
+  make -j4
+  ls -lh lib
+  popd
+fi
+
+if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
+  pushd ../sherpa-onnx/java-api
+  make
+  popd
+fi
+
+# -f makes curl exit non-zero on an HTTP error instead of saving the error
+# page as the model file, so `set -e` stops the script right here.
+if [ ! -f ./silero_vad.onnx ]; then
+  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then
+  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then
+  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+
+  tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+  rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+fi
+
+# Quote the library path so a checkout directory containing spaces still works.
+java \
+  -Djava.library.path="$PWD/../build/lib" \
+  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
+  ./VadNonStreamingParaformer.java
diff --git a/java-api-examples/run-vad-remove-slience.sh b/java-api-examples/run-vad-remove-slience.sh
new file mode 100755
index 00000000..0fd08ca6
--- /dev/null
+++ b/java-api-examples/run-vad-remove-slience.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+#
+# Builds the JNI library and the Java API jar when they are missing, downloads
+# the silero-vad model and a test wave file, and then runs ./VadRemoveSilence.java.
+
+set -ex
+
+# Build the JNI library only when neither the .dylib nor the .so is present.
+if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
+  mkdir -p ../build
+  pushd ../build
+  cmake \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    -DSHERPA_ONNX_ENABLE_JNI=ON \
+    ..
+
+  make -j4
+  ls -lh lib
+  popd
+fi
+
+if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
+  pushd ../sherpa-onnx/java-api
+  make
+  popd
+fi
+
+# -f makes curl exit non-zero on an HTTP error instead of saving the error
+# page as the model file, so `set -e` stops the script right here.
+if [ ! -f ./silero_vad.onnx ]; then
+  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then
+  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+# Quote the library path so a checkout directory containing spaces still works.
+java \
+  -Djava.library.path="$PWD/../build/lib" \
+  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
+  ./VadRemoveSilence.java
diff --git a/sherpa-onnx/java-api/Makefile b/sherpa-onnx/java-api/Makefile
index 61a652d2..18bbbcba 100644
--- a/sherpa-onnx/java-api/Makefile
+++ b/sherpa-onnx/java-api/Makefile
@@ -7,6 +7,7 @@ out_jar := $(out_dir)/sherpa-onnx.jar
 package_dir := com/k2fsa/sherpa/onnx
 
 java_files := WaveReader.java
+java_files += WaveWriter.java
 java_files += EndpointRule.java
 java_files += EndpointConfig.java
 java_files += FeatureConfig.java
@@ -56,6 +57,11 @@ java_files += SpeakerEmbeddingExtractorConfig.java
 java_files += SpeakerEmbeddingExtractor.java
 java_files += SpeakerEmbeddingManager.java
 
+java_files += SileroVadModelConfig.java
+java_files += VadModelConfig.java
+java_files += SpeechSegment.java
+java_files += Vad.java
+
 class_files := $(java_files:%.java=%.class)
 
 java_files := $(addprefix src/$(package_dir)/,$(java_files))
diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java
new file mode 100644
index 00000000..1cf019c0
--- /dev/null
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java
@@ -0,0 +1,90 @@
+// Copyright 2024 Xiaomi Corporation
+
+package com.k2fsa.sherpa.onnx;
+
+/** Immutable settings for the silero VAD model; obtain instances via {@link #builder()}. */
+public class SileroVadModelConfig {
+  private final String model;
+  private final float threshold;
+  private final float minSilenceDuration;
+  private final float minSpeechDuration;
+  private final int windowSize;
+
+  /** Snapshots the values currently held by the given builder. */
+  private SileroVadModelConfig(Builder b) {
+    model = b.model;
+    threshold = b.threshold;
+    minSilenceDuration = b.minSilenceDuration;
+    minSpeechDuration = b.minSpeechDuration;
+    windowSize = b.windowSize;
+  }
+
+  /** Creates a builder preloaded with the defaults declared in {@link Builder}. */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /** Returns the configured model string (empty by default). */
+  public String getModel() {
+    return model;
+  }
+
+  /** Returns the configured threshold (0.5 by default). */
+  public float getThreshold() {
+    return threshold;
+  }
+
+  /** Returns the configured minimum silence duration (0.25 by default). */
+  public float getMinSilenceDuration() {
+    return minSilenceDuration;
+  }
+
+  /** Returns the configured minimum speech duration (0.5 by default). */
+  public float getMinSpeechDuration() {
+    return minSpeechDuration;
+  }
+
+  /** Returns the configured window size (512 by default). */
+  public int getWindowSize() {
+    return windowSize;
+  }
+
+  /** Fluent builder; every setter returns this builder so calls can be chained. */
+  public static class Builder {
+    private String model = "";
+    private float threshold = 0.5f;
+    private float minSilenceDuration = 0.25f;
+    private float minSpeechDuration = 0.5f;
+    private int windowSize = 512;
+
+    public Builder setModel(String model) {
+      this.model = model;
+      return this;
+    }
+
+    public Builder setThreshold(float threshold) {
+      this.threshold = threshold;
+      return this;
+    }
+
+    public Builder setMinSilenceDuration(float minSilenceDuration) {
+      this.minSilenceDuration = minSilenceDuration;
+      return this;
+    }
+
+    public Builder setMinSpeechDuration(float minSpeechDuration) {
+      this.minSpeechDuration = minSpeechDuration;
+      return this;
+    }
+
+    public Builder setWindowSize(int windowSize) {
+      this.windowSize = windowSize;
+      return this;
+    }
+
+    /** Produces an immutable config from the builder's current values. */
+    public SileroVadModelConfig build() {
+      return new SileroVadModelConfig(this);
+    }
+  }
+}
diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeechSegment.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeechSegment.java
new file mode 100644
index 00000000..e6aab4f2
--- /dev/null
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeechSegment.java
@@ -0,0 +1,28 @@
+package com.k2fsa.sherpa.onnx;
+
+/** A speech segment returned by {@link Vad#front()}: a start value plus its samples. */
+public class SpeechSegment {
+
+  private final int start;
+  private final float[] samples;
+
+  /**
+   * @param start start value of the segment. NOTE(review): presumably a sample index, since
+   *     callers divide it by the sample rate to get seconds -- confirm
+   * @param samples samples of the segment; the array is stored as-is, not copied
+   */
+  public SpeechSegment(int start, float[] samples) {
+    this.start = start;
+    this.samples = samples;
+  }
+
+  /** Returns the start value given to the constructor. */
+  public int getStart() {
+    return start;
+  }
+
+  /** Returns the array given to the constructor (the same instance, not a copy). */
+  public float[] getSamples() {
+    return samples;
+  }
+}
diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java
new file mode 100644
index 00000000..c0115e8b
--- /dev/null
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java
@@ -0,0 +1,101 @@
+// Copyright 2024 Xiaomi Corporation
+
+package com.k2fsa.sherpa.onnx;
+
+/**
+ * Java handle for a native voice activity detector from the sherpa-onnx-jni library.
+ *
+ * <p>Every public method forwards to a native method, passing the pointer obtained in the
+ * constructor. NOTE(review): no method checks whether {@link #release()} has already zeroed
+ * that pointer; presumably the object must not be used after release -- confirm.
+ */
+public class Vad {
+  static {
+    System.loadLibrary("sherpa-onnx-jni");
+  }
+
+  // Native handle; release() resets it to 0.
+  private long ptr = 0;
+
+  /** Creates the native detector from {@code config}. */
+  public Vad(VadModelConfig config) {
+    ptr = newFromFile(config);
+  }
+
+  /** Calls {@link #release()} so the native object is freed if the caller never did. */
+  @Override
+  protected void finalize() throws Throwable {
+    release();
+  }
+
+  /** Deletes the native object and zeroes the handle; further calls are no-ops. */
+  public void release() {
+    if (this.ptr == 0) {
+      return;
+    }
+    delete(this.ptr);
+    this.ptr = 0;
+  }
+
+  /** Passes {@code samples} to the native detector. */
+  public void acceptWaveform(float[] samples) {
+    acceptWaveform(this.ptr, samples);
+  }
+
+  /** Returns the result of the native {@code empty} call. */
+  public boolean empty() {
+    return empty(this.ptr);
+  }
+
+  /** Forwards to the native {@code pop} call. */
+  public void pop() {
+    pop(this.ptr);
+  }
+
+  /** Forwards to the native {@code clear} call. */
+  public void clear() {
+    clear(this.ptr);
+  }
+
+  /** Forwards to the native {@code reset} call. */
+  public void reset() {
+    reset(this.ptr);
+  }
+
+  /**
+   * Wraps the native {@code front} result in a {@link SpeechSegment}.
+   *
+   * <p>The native side returns a two-element array: index 0 is the start value and index 1
+   * is the {@code float[]} of samples.
+   */
+  public SpeechSegment front() {
+    Object[] arr = front(this.ptr);
+    int start = (int) arr[0];
+    float[] samples = (float[]) arr[1];
+
+    return new SpeechSegment(start, samples);
+  }
+
+  /** Returns the result of the native {@code isSpeechDetected} call. */
+  public boolean isSpeechDetected() {
+    return isSpeechDetected(this.ptr);
+  }
+
+  private native void delete(long ptr);
+
+  private native long newFromFile(VadModelConfig config);
+
+  private native void acceptWaveform(long ptr, float[] samples);
+
+  private native boolean empty(long ptr);
+
+  private native void pop(long ptr);
+
+  private native void clear(long ptr);
+
+  private native Object[] front(long ptr);
+
+  private native boolean isSpeechDetected(long ptr);
+
+  private native void reset(long ptr);
+}
diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/VadModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/VadModelConfig.java
new file mode 100644
index 00000000..94ffcd26
--- /dev/null
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/VadModelConfig.java
@@ -0,0 +1,87 @@
+// Copyright 2024 Xiaomi Corporation
+
+package com.k2fsa.sherpa.onnx;
+
+/** Immutable settings passed to {@link Vad}; obtain instances via {@link #builder()}. */
+public class VadModelConfig {
+  private final SileroVadModelConfig sileroVadModelConfig;
+  private final int sampleRate;
+  private final int numThreads;
+  private final boolean debug;
+  private final String provider;
+
+  /** Snapshots the values currently held by {@code builder}. */
+  private VadModelConfig(Builder builder) {
+    this.sileroVadModelConfig = builder.sileroVadModelConfig;
+    this.sampleRate = builder.sampleRate;
+    this.numThreads = builder.numThreads;
+    this.debug = builder.debug;
+    this.provider = builder.provider;
+  }
+
+  /** Creates a builder preloaded with the defaults declared in {@link Builder}. */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  public SileroVadModelConfig getSileroVadModelConfig() {
+    return sileroVadModelConfig;
+  }
+
+  public int getSampleRate() {
+    return sampleRate;
+  }
+
+  public int getNumThreads() {
+    return numThreads;
+  }
+
+  public String getProvider() {
+    return provider;
+  }
+
+  public boolean getDebug() {
+    return debug;
+  }
+
+  /**
+   * Fluent builder. Defaults: a default {@link SileroVadModelConfig}, sample rate 16000, one
+   * thread, debug enabled, provider "cpu".
+   */
+  public static class Builder {
+    private SileroVadModelConfig sileroVadModelConfig = new SileroVadModelConfig.Builder().build();
+    private int sampleRate = 16000;
+    private int numThreads = 1;
+    private boolean debug = true;
+    private String provider = "cpu";
+
+    public VadModelConfig build() {
+      return new VadModelConfig(this);
+    }
+
+    public Builder setSileroVadModelConfig(SileroVadModelConfig sileroVadModelConfig) {
+      this.sileroVadModelConfig = sileroVadModelConfig;
+      return this;
+    }
+
+    public Builder setSampleRate(int sampleRate) {
+      this.sampleRate = sampleRate;
+      return this;
+    }
+
+    public Builder setNumThreads(int numThreads) {
+      this.numThreads = numThreads;
+      return this;
+    }
+
+    public Builder setDebug(boolean debug) {
+      this.debug = debug;
+      return this;
+    }
+
+    public Builder setProvider(String provider) {
+      this.provider = provider;
+      return this;
+    }
+  }
+}
diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/WaveWriter.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/WaveWriter.java
new file mode 100644
index 00000000..67efb0c0
--- /dev/null
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/WaveWriter.java
@@ -0,0 +1,30 @@
+// Copyright 2024 Xiaomi Corporation
+
+package com.k2fsa.sherpa.onnx;
+
+/** Writes float samples to a wave file through the sherpa-onnx-jni native library. */
+public class WaveWriter {
+  static {
+    // Load the native library here too, so write() works even when no other
+    // JNI-backed class has been initialized first.
+    System.loadLibrary("sherpa-onnx-jni");
+  }
+
+  public WaveWriter() {
+  }
+
+  /**
+   * Writes {@code samples} to {@code filename}.
+   *
+   * @param filename path of the wave file to write
+   * @param samples audio samples to write
+   * @param sampleRate sample rate passed to the native writer
+   * @return the value returned by the native writer; presumably {@code true} on success
+   */
+  public static boolean write(String filename, float[] samples, int sampleRate) {
+    return writeWaveToFile(filename, samples, sampleRate);
+  }
+
+  // Static to match the JNI implementation, which takes a jclass and ignores it.
+  private static native boolean writeWaveToFile(String filename, float[] samples, int sampleRate);
+}
diff --git a/sherpa-onnx/jni/CMakeLists.txt b/sherpa-onnx/jni/CMakeLists.txt
index eeed0930..64944b4a 100644
--- a/sherpa-onnx/jni/CMakeLists.txt
+++ b/sherpa-onnx/jni/CMakeLists.txt
@@ -24,6 +24,7 @@ set(sources
   spoken-language-identification.cc
   voice-activity-detector.cc
   wave-reader.cc
+  wave-writer.cc
 )
 
 if(SHERPA_ONNX_ENABLE_TTS)
diff --git a/sherpa-onnx/jni/wave-writer.cc b/sherpa-onnx/jni/wave-writer.cc
new file mode 100644
index 00000000..93677f98
--- /dev/null
+++ b/sherpa-onnx/jni/wave-writer.cc
@@ -0,0 +1,34 @@
+// sherpa-onnx/jni/wave-writer.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+#include "sherpa-onnx/csrc/wave-writer.h"
+
+#include "sherpa-onnx/jni/common.h"
+
+// JNI entry point for com.k2fsa.sherpa.onnx.WaveWriter.writeWaveToFile.
+// Returns JNI_TRUE when sherpa_onnx::WriteWave() returns true, and JNI_FALSE
+// otherwise or when the JVM cannot provide the array or string contents.
+SHERPA_ONNX_EXTERN_C
+JNIEXPORT jboolean JNICALL Java_com_k2fsa_sherpa_onnx_WaveWriter_writeWaveToFile(
+    JNIEnv *env, jclass /*cls*/, jstring filename, jfloatArray samples,
+    jint sample_rate) {
+  jfloat *p = env->GetFloatArrayElements(samples, nullptr);
+  if (p == nullptr) {
+    return JNI_FALSE;
+  }
+  jsize n = env->GetArrayLength(samples);
+
+  const char *p_filename = env->GetStringUTFChars(filename, nullptr);
+  if (p_filename == nullptr) {
+    env->ReleaseFloatArrayElements(samples, p, JNI_ABORT);
+    return JNI_FALSE;
+  }
+
+  bool ok = sherpa_onnx::WriteWave(p_filename, sample_rate, p, n);
+
+  // JNI_ABORT: the samples were only read, so nothing needs copying back.
+  env->ReleaseFloatArrayElements(samples, p, JNI_ABORT);
+  env->ReleaseStringUTFChars(filename, p_filename);
+
+  return ok ? JNI_TRUE : JNI_FALSE;
+}