Add VAD + Non-streaming ASR + microphone examples for Java API (#1046)
This commit is contained in:
@@ -63,6 +63,18 @@ The punctuation model supports both English and Chinese.
|
|||||||
./run-vad-from-mic.sh
|
./run-vad-from-mic.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## VAD with a microphone + Non-streaming Paraformer for speech recognition
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./run-vad-from-mic-non-streaming-paraformer.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## VAD with a microphone + Non-streaming Whisper tiny.en for speech recognition
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./run-vad-from-mic-non-streaming-whisper.sh
|
||||||
|
```
|
||||||
|
|
||||||
## VAD (Remove silence)
|
## VAD (Remove silence)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
146
java-api-examples/VadFromMicWithNonStreamingParaformer.java
Normal file
146
java-api-examples/VadFromMicWithNonStreamingParaformer.java
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
// Copyright 2024 Xiaomi Corporation
|
||||||
|
|
||||||
|
// This file shows how to use a silero_vad model with a non-streaming Paraformer
|
||||||
|
// for speech recognition.
|
||||||
|
|
||||||
|
import com.k2fsa.sherpa.onnx.*;
|
||||||
|
import javax.sound.sampled.*;
|
||||||
|
|
||||||
|
public class VadFromMicWithNonStreamingParaformer {
|
||||||
|
private static final int sampleRate = 16000;
|
||||||
|
private static final int windowSize = 512;
|
||||||
|
|
||||||
|
public static Vad createVad() {
|
||||||
|
// please download ./silero_vad.onnx from
|
||||||
|
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||||
|
String model = "./silero_vad.onnx";
|
||||||
|
SileroVadModelConfig sileroVad =
|
||||||
|
SileroVadModelConfig.builder()
|
||||||
|
.setModel(model)
|
||||||
|
.setThreshold(0.5f)
|
||||||
|
.setMinSilenceDuration(0.25f)
|
||||||
|
.setMinSpeechDuration(0.5f)
|
||||||
|
.setWindowSize(windowSize)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
VadModelConfig config =
|
||||||
|
VadModelConfig.builder()
|
||||||
|
.setSileroVadModelConfig(sileroVad)
|
||||||
|
.setSampleRate(sampleRate)
|
||||||
|
.setNumThreads(1)
|
||||||
|
.setDebug(true)
|
||||||
|
.setProvider("cpu")
|
||||||
|
.build();
|
||||||
|
|
||||||
|
return new Vad(config);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static OfflineRecognizer createOfflineRecognizer() {
|
||||||
|
// please refer to
|
||||||
|
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english
|
||||||
|
// to download model files
|
||||||
|
String model = "./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx";
|
||||||
|
String tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt";
|
||||||
|
|
||||||
|
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
|
||||||
|
String ruleFsts = "./itn_zh_number.fst";
|
||||||
|
|
||||||
|
OfflineParaformerModelConfig paraformer =
|
||||||
|
OfflineParaformerModelConfig.builder().setModel(model).build();
|
||||||
|
|
||||||
|
OfflineModelConfig modelConfig =
|
||||||
|
OfflineModelConfig.builder()
|
||||||
|
.setParaformer(paraformer)
|
||||||
|
.setTokens(tokens)
|
||||||
|
.setNumThreads(1)
|
||||||
|
.setDebug(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
OfflineRecognizerConfig config =
|
||||||
|
OfflineRecognizerConfig.builder()
|
||||||
|
.setOfflineModelConfig(modelConfig)
|
||||||
|
.setDecodingMethod("greedy_search")
|
||||||
|
.setRuleFsts(ruleFsts)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
return new OfflineRecognizer(config);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Vad vad = createVad();
|
||||||
|
OfflineRecognizer recognizer = createOfflineRecognizer();
|
||||||
|
|
||||||
|
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
|
||||||
|
// Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
|
||||||
|
AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
|
||||||
|
|
||||||
|
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
|
||||||
|
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
|
||||||
|
TargetDataLine targetDataLine;
|
||||||
|
try {
|
||||||
|
targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
|
||||||
|
targetDataLine.open(format);
|
||||||
|
targetDataLine.start();
|
||||||
|
} catch (LineUnavailableException e) {
|
||||||
|
System.out.println("Failed to open target data line: " + e.getMessage());
|
||||||
|
vad.release();
|
||||||
|
recognizer.release();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean printed = false;
|
||||||
|
byte[] buffer = new byte[windowSize * 2];
|
||||||
|
float[] samples = new float[windowSize];
|
||||||
|
|
||||||
|
System.out.println("Started. Please speak");
|
||||||
|
boolean running = true;
|
||||||
|
while (targetDataLine.isOpen() && running) {
|
||||||
|
int n = targetDataLine.read(buffer, 0, buffer.length);
|
||||||
|
if (n <= 0) {
|
||||||
|
System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (int i = 0; i != windowSize; ++i) {
|
||||||
|
short low = buffer[2 * i];
|
||||||
|
short high = buffer[2 * i + 1];
|
||||||
|
int s = (high << 8) + low;
|
||||||
|
samples[i] = (float) s / 32768;
|
||||||
|
}
|
||||||
|
|
||||||
|
vad.acceptWaveform(samples);
|
||||||
|
if (vad.isSpeechDetected() && !printed) {
|
||||||
|
System.out.println("Detected speech");
|
||||||
|
printed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!vad.isSpeechDetected()) {
|
||||||
|
printed = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!vad.empty()) {
|
||||||
|
SpeechSegment segment = vad.front();
|
||||||
|
float startTime = segment.getStart() / (float) sampleRate;
|
||||||
|
float duration = segment.getSamples().length / (float) sampleRate;
|
||||||
|
|
||||||
|
OfflineStream stream = recognizer.createStream();
|
||||||
|
stream.acceptWaveform(segment.getSamples(), sampleRate);
|
||||||
|
recognizer.decode(stream);
|
||||||
|
String text = recognizer.getResult(stream).getText();
|
||||||
|
stream.release();
|
||||||
|
|
||||||
|
if (!text.isEmpty()) {
|
||||||
|
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (text.contains("退出程序")) {
|
||||||
|
running = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
vad.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vad.release();
|
||||||
|
recognizer.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
143
java-api-examples/VadFromMicWithNonStreamingWhisper.java
Normal file
143
java-api-examples/VadFromMicWithNonStreamingWhisper.java
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
// Copyright 2024 Xiaomi Corporation
|
||||||
|
|
||||||
|
// This file shows how to use a silero_vad model with a non-streaming Whisper tiny.en
|
||||||
|
// for speech recognition.
|
||||||
|
|
||||||
|
import com.k2fsa.sherpa.onnx.*;
|
||||||
|
import javax.sound.sampled.*;
|
||||||
|
|
||||||
|
public class VadFromMicNonStreamingWhisper {
|
||||||
|
private static final int sampleRate = 16000;
|
||||||
|
private static final int windowSize = 512;
|
||||||
|
|
||||||
|
public static Vad createVad() {
|
||||||
|
// please download ./silero_vad.onnx from
|
||||||
|
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||||
|
String model = "./silero_vad.onnx";
|
||||||
|
SileroVadModelConfig sileroVad =
|
||||||
|
SileroVadModelConfig.builder()
|
||||||
|
.setModel(model)
|
||||||
|
.setThreshold(0.5f)
|
||||||
|
.setMinSilenceDuration(0.25f)
|
||||||
|
.setMinSpeechDuration(0.5f)
|
||||||
|
.setWindowSize(windowSize)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
VadModelConfig config =
|
||||||
|
VadModelConfig.builder()
|
||||||
|
.setSileroVadModelConfig(sileroVad)
|
||||||
|
.setSampleRate(sampleRate)
|
||||||
|
.setNumThreads(1)
|
||||||
|
.setDebug(true)
|
||||||
|
.setProvider("cpu")
|
||||||
|
.build();
|
||||||
|
|
||||||
|
return new Vad(config);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static OfflineRecognizer createOfflineRecognizer() {
|
||||||
|
// please refer to
|
||||||
|
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html
|
||||||
|
// to download model files
|
||||||
|
String encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx";
|
||||||
|
String decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx";
|
||||||
|
String tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt";
|
||||||
|
|
||||||
|
OfflineWhisperModelConfig whisper =
|
||||||
|
OfflineWhisperModelConfig.builder().setEncoder(encoder).setDecoder(decoder).build();
|
||||||
|
|
||||||
|
OfflineModelConfig modelConfig =
|
||||||
|
OfflineModelConfig.builder()
|
||||||
|
.setWhisper(whisper)
|
||||||
|
.setTokens(tokens)
|
||||||
|
.setNumThreads(1)
|
||||||
|
.setDebug(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
OfflineRecognizerConfig config =
|
||||||
|
OfflineRecognizerConfig.builder()
|
||||||
|
.setOfflineModelConfig(modelConfig)
|
||||||
|
.setDecodingMethod("greedy_search")
|
||||||
|
.build();
|
||||||
|
|
||||||
|
return new OfflineRecognizer(config);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Vad vad = createVad();
|
||||||
|
OfflineRecognizer recognizer = createOfflineRecognizer();
|
||||||
|
|
||||||
|
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
|
||||||
|
// Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
|
||||||
|
AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
|
||||||
|
|
||||||
|
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
|
||||||
|
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
|
||||||
|
TargetDataLine targetDataLine;
|
||||||
|
try {
|
||||||
|
targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
|
||||||
|
targetDataLine.open(format);
|
||||||
|
targetDataLine.start();
|
||||||
|
} catch (LineUnavailableException e) {
|
||||||
|
System.out.println("Failed to open target data line: " + e.getMessage());
|
||||||
|
vad.release();
|
||||||
|
recognizer.release();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean printed = false;
|
||||||
|
byte[] buffer = new byte[windowSize * 2];
|
||||||
|
float[] samples = new float[windowSize];
|
||||||
|
|
||||||
|
System.out.println("Started. Please speak");
|
||||||
|
boolean running = true;
|
||||||
|
while (targetDataLine.isOpen() && running) {
|
||||||
|
int n = targetDataLine.read(buffer, 0, buffer.length);
|
||||||
|
if (n <= 0) {
|
||||||
|
System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (int i = 0; i != windowSize; ++i) {
|
||||||
|
short low = buffer[2 * i];
|
||||||
|
short high = buffer[2 * i + 1];
|
||||||
|
int s = (high << 8) + low;
|
||||||
|
samples[i] = (float) s / 32768;
|
||||||
|
}
|
||||||
|
|
||||||
|
vad.acceptWaveform(samples);
|
||||||
|
if (vad.isSpeechDetected() && !printed) {
|
||||||
|
System.out.println("Detected speech");
|
||||||
|
printed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!vad.isSpeechDetected()) {
|
||||||
|
printed = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!vad.empty()) {
|
||||||
|
SpeechSegment segment = vad.front();
|
||||||
|
float startTime = segment.getStart() / (float) sampleRate;
|
||||||
|
float duration = segment.getSamples().length / (float) sampleRate;
|
||||||
|
|
||||||
|
OfflineStream stream = recognizer.createStream();
|
||||||
|
stream.acceptWaveform(segment.getSamples(), sampleRate);
|
||||||
|
recognizer.decode(stream);
|
||||||
|
String text = recognizer.getResult(stream).getText();
|
||||||
|
stream.release();
|
||||||
|
|
||||||
|
if (!text.isEmpty()) {
|
||||||
|
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (text.contains("exit the program")) {
|
||||||
|
running = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
vad.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vad.release();
|
||||||
|
recognizer.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
46
java-api-examples/run-vad-from-mic-non-streaming-paraformer.sh
Executable file
46
java-api-examples/run-vad-from-mic-non-streaming-paraformer.sh
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx JNI library and the Java API jar when they are missing,
# downloads silero_vad.onnx, the Paraformer model and itn_zh_number.fst when
# they are missing, and then runs ./VadFromMicWithNonStreamingParaformer.java.
#
# All paths are relative to the current working directory.

set -ex

if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2

  tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
  rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
fi

if [ ! -f ./itn_zh_number.fst ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
fi

# Quote $PWD so a working directory containing spaces is not split into several arguments.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./VadFromMicWithNonStreamingParaformer.java
|
||||||
42
java-api-examples/run-vad-from-mic-non-streaming-whisper.sh
Executable file
42
java-api-examples/run-vad-from-mic-non-streaming-whisper.sh
Executable file
@@ -0,0 +1,42 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx JNI library and the Java API jar when they are missing,
# downloads silero_vad.onnx and the Whisper tiny.en model when they are missing,
# and then runs ./VadFromMicWithNonStreamingWhisper.java.
#
# All paths are relative to the current working directory.

set -ex

if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2

  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi

# Quote $PWD so a working directory containing spaces is not split into several arguments.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./VadFromMicWithNonStreamingWhisper.java
|
||||||
@@ -1,183 +0,0 @@
|
|||||||
/*
|
|
||||||
* // Copyright 2022-2023 by zhaoming
|
|
||||||
*/
|
|
||||||
/*
|
|
||||||
Config modelconfig.cfg
|
|
||||||
sample_rate=16000
|
|
||||||
feature_dim=80
|
|
||||||
rule1_min_trailing_silence=2.4
|
|
||||||
rule2_min_trailing_silence=1.2
|
|
||||||
rule3_min_utterance_length=20
|
|
||||||
encoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx
|
|
||||||
decoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx
|
|
||||||
joiner=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx
|
|
||||||
tokens=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt
|
|
||||||
num_threads=4
|
|
||||||
enable_endpoint_detection=false
|
|
||||||
decoding_method=greedy_search
|
|
||||||
max_active_paths=4
|
|
||||||
*/
|
|
||||||
|
|
||||||
import com.k2fsa.sherpa.onnx.OnlineRecognizer;
|
|
||||||
import com.k2fsa.sherpa.onnx.OnlineStream;
|
|
||||||
import java.io.*;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
|
|
||||||
public class DecodeFile {
|
|
||||||
OnlineRecognizer rcgOjb;
|
|
||||||
OnlineStream streamObj;
|
|
||||||
String wavfilename;
|
|
||||||
|
|
||||||
public DecodeFile(String fileName) {
|
|
||||||
wavfilename = fileName;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void initModelWithPara() {
|
|
||||||
try {
|
|
||||||
String modelDir =
|
|
||||||
"/sherpa-onnx/build_old/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";
|
|
||||||
String encoder = modelDir + "/encoder-epoch-99-avg-1.onnx";
|
|
||||||
String decoder = modelDir + "/decoder-epoch-99-avg-1.onnx";
|
|
||||||
String joiner = modelDir + "/joiner-epoch-99-avg-1.onnx";
|
|
||||||
String tokens = modelDir + "/tokens.txt";
|
|
||||||
int numThreads = 4;
|
|
||||||
int sampleRate = 16000;
|
|
||||||
int featureDim = 80;
|
|
||||||
boolean enableEndpointDetection = false;
|
|
||||||
float rule1MinTrailingSilence = 2.4F;
|
|
||||||
float rule2MinTrailingSilence = 1.2F;
|
|
||||||
float rule3MinUtteranceLength = 20F;
|
|
||||||
String decodingMethod = "greedy_search";
|
|
||||||
int maxActivePaths = 4;
|
|
||||||
String hotwordsFile = "";
|
|
||||||
float hotwordsScore = 1.5F;
|
|
||||||
String lm_model = "";
|
|
||||||
float lm_scale = 0.5F;
|
|
||||||
String modelType = "zipformer";
|
|
||||||
rcgOjb =
|
|
||||||
new OnlineRecognizer(
|
|
||||||
tokens,
|
|
||||||
encoder,
|
|
||||||
decoder,
|
|
||||||
joiner,
|
|
||||||
numThreads,
|
|
||||||
sampleRate,
|
|
||||||
featureDim,
|
|
||||||
enableEndpointDetection,
|
|
||||||
rule1MinTrailingSilence,
|
|
||||||
rule2MinTrailingSilence,
|
|
||||||
rule3MinUtteranceLength,
|
|
||||||
decodingMethod,
|
|
||||||
lm_model,
|
|
||||||
lm_scale,
|
|
||||||
maxActivePaths,
|
|
||||||
hotwordsFile,
|
|
||||||
hotwordsScore,
|
|
||||||
modelType);
|
|
||||||
streamObj = rcgOjb.createStream();
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println(e);
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void initModelWithCfg(String cfgFile) {
|
|
||||||
try {
|
|
||||||
// you should set setCfgPath() before running this
|
|
||||||
rcgOjb = new OnlineRecognizer(cfgFile);
|
|
||||||
streamObj = rcgOjb.createStream();
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println(e);
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void simpleExample() {
|
|
||||||
try {
|
|
||||||
float[] buffer = rcgOjb.readWavFile(wavfilename); // read data from file
|
|
||||||
streamObj.acceptWaveform(buffer); // feed stream with data
|
|
||||||
streamObj.inputFinished(); // tell engine you done with all data
|
|
||||||
OnlineStream ssObj[] = new OnlineStream[1];
|
|
||||||
while (rcgOjb.isReady(streamObj)) { // engine is ready for unprocessed data
|
|
||||||
ssObj[0] = streamObj;
|
|
||||||
rcgOjb.decodeStreams(ssObj); // decode for multiple stream
|
|
||||||
// rcgOjb.DecodeStream(streamObj); // decode for single stream
|
|
||||||
}
|
|
||||||
|
|
||||||
String recText = "simple:" + rcgOjb.getResult(streamObj) + "\n";
|
|
||||||
byte[] utf8Data = recText.getBytes(StandardCharsets.UTF_8);
|
|
||||||
System.out.println(new String(utf8Data));
|
|
||||||
rcgOjb.reSet(streamObj);
|
|
||||||
rcgOjb.releaseStream(streamObj); // release stream
|
|
||||||
rcgOjb.release(); // release recognizer
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println(e);
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void streamExample() {
|
|
||||||
try {
|
|
||||||
float[] buffer = rcgOjb.readWavFile(wavfilename); // read data from file
|
|
||||||
float[] chunk = new float[1600]; // //each time read 1600(0.1s) data
|
|
||||||
int chunkIndex = 0;
|
|
||||||
for (int i = 0; i < buffer.length; i++) // total wav length loop
|
|
||||||
{
|
|
||||||
chunk[chunkIndex] = buffer[i];
|
|
||||||
chunkIndex++;
|
|
||||||
if (chunkIndex >= 1600 || i == (buffer.length - 1)) {
|
|
||||||
chunkIndex = 0;
|
|
||||||
streamObj.acceptWaveform(chunk); // feed chunk
|
|
||||||
if (rcgOjb.isReady(streamObj)) {
|
|
||||||
rcgOjb.decodeStream(streamObj);
|
|
||||||
}
|
|
||||||
String testDate = rcgOjb.getResult(streamObj);
|
|
||||||
byte[] utf8Data = testDate.getBytes(StandardCharsets.UTF_8);
|
|
||||||
|
|
||||||
if (utf8Data.length > 0) {
|
|
||||||
System.out.println(Float.valueOf((float) i / 16000) + ":" + new String(utf8Data));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
streamObj.inputFinished();
|
|
||||||
while (rcgOjb.isReady(streamObj)) {
|
|
||||||
rcgOjb.decodeStream(streamObj);
|
|
||||||
}
|
|
||||||
|
|
||||||
String recText = "stream:" + rcgOjb.getResult(streamObj) + "\n";
|
|
||||||
byte[] utf8Data = recText.getBytes(StandardCharsets.UTF_8);
|
|
||||||
System.out.println(new String(utf8Data));
|
|
||||||
rcgOjb.reSet(streamObj);
|
|
||||||
rcgOjb.releaseStream(streamObj); // release stream
|
|
||||||
rcgOjb.release(); // release recognizer
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println(e);
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) {
|
|
||||||
try {
|
|
||||||
String appDir = System.getProperty("user.dir");
|
|
||||||
System.out.println("appdir=" + appDir);
|
|
||||||
String fileName = appDir + "/" + args[0];
|
|
||||||
String cfgPath = appDir + "/modeltest.cfg";
|
|
||||||
String soPath = appDir + "/../build/lib/libsherpa-onnx-jni.so";
|
|
||||||
OnlineRecognizer.setSoPath(soPath);
|
|
||||||
DecodeFile rcgDemo = new DecodeFile(fileName);
|
|
||||||
|
|
||||||
// ***************** */
|
|
||||||
rcgDemo.initModelWithCfg(cfgPath);
|
|
||||||
rcgDemo.streamExample();
|
|
||||||
// **************** */
|
|
||||||
rcgDemo.initModelWithCfg(cfgPath);
|
|
||||||
rcgDemo.simpleExample();
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println(e);
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,223 +0,0 @@
|
|||||||
/*
|
|
||||||
* // Copyright 2022-2023 by zhaoming
|
|
||||||
*/
|
|
||||||
/*
|
|
||||||
Real-time speech recognition from a microphone with com.k2fsa.sherpa.onnx Java API
|
|
||||||
|
|
||||||
example for cfgFile modelconfig.cfg
|
|
||||||
sample_rate=16000
|
|
||||||
feature_dim=80
|
|
||||||
rule1_min_trailing_silence=2.4
|
|
||||||
rule2_min_trailing_silence=1.2
|
|
||||||
rule3_min_utterance_length=20
|
|
||||||
encoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx
|
|
||||||
decoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx
|
|
||||||
joiner=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx
|
|
||||||
tokens=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt
|
|
||||||
num_threads=4
|
|
||||||
enable_endpoint_detection=true
|
|
||||||
decoding_method=greedy_search
|
|
||||||
max_active_paths=4
|
|
||||||
|
|
||||||
*/
|
|
||||||
import com.k2fsa.sherpa.onnx.OnlineRecognizer;
|
|
||||||
import com.k2fsa.sherpa.onnx.OnlineStream;
|
|
||||||
import java.io.*;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.ByteOrder;
|
|
||||||
import java.nio.ShortBuffer;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import javax.sound.sampled.AudioFormat;
|
|
||||||
import javax.sound.sampled.AudioSystem;
|
|
||||||
import javax.sound.sampled.DataLine;
|
|
||||||
import javax.sound.sampled.TargetDataLine;
|
|
||||||
|
|
||||||
/** Microphone Example */
|
|
||||||
public class DecodeMic {
|
|
||||||
MicRcgThread micRcgThread = null; // thread handle
|
|
||||||
|
|
||||||
OnlineRecognizer rcgOjb; // the recognizer
|
|
||||||
|
|
||||||
OnlineStream streamObj; // the stream
|
|
||||||
|
|
||||||
public DecodeMic() {
|
|
||||||
|
|
||||||
micRcgThread = new MicRcgThread(); // create a new instance for MicRcgThread
|
|
||||||
}
|
|
||||||
|
|
||||||
public void open() {
|
|
||||||
micRcgThread.start(); // start to capture microphone data
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() {
|
|
||||||
micRcgThread.stop(); // close capture
|
|
||||||
}
|
|
||||||
|
|
||||||
/** init asr engine with config file */
|
|
||||||
public void initModelWithCfg(String cfgFile) {
|
|
||||||
try {
|
|
||||||
|
|
||||||
// set setSoPath() before running this
|
|
||||||
rcgOjb = new OnlineRecognizer(cfgFile);
|
|
||||||
|
|
||||||
streamObj = rcgOjb.createStream(); // create a stream for asr engine to feed data
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println(e);
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** read data from mic and feed to asr engine */
|
|
||||||
class MicRcgThread implements Runnable {
|
|
||||||
|
|
||||||
TargetDataLine capline; // line for capture mic data
|
|
||||||
|
|
||||||
Thread thread; // this thread
|
|
||||||
int segmentId = 0; // record the segment id when detect endpoint
|
|
||||||
String preText = ""; // decoded text
|
|
||||||
|
|
||||||
public MicRcgThread() {}
|
|
||||||
|
|
||||||
public void start() {
|
|
||||||
|
|
||||||
thread = new Thread(this);
|
|
||||||
|
|
||||||
thread.start(); // start thread
|
|
||||||
}
|
|
||||||
|
|
||||||
public void stop() {
|
|
||||||
capline.stop();
|
|
||||||
capline.close();
|
|
||||||
capline = null;
|
|
||||||
thread = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** feed captured microphone data to asr */
|
|
||||||
public void decodeSample(byte[] samplebytes) {
|
|
||||||
try {
|
|
||||||
ByteBuffer byteBuf = ByteBuffer.wrap(samplebytes); // create a bytebuf for samples
|
|
||||||
byteBuf.order(ByteOrder.LITTLE_ENDIAN); // set bytebuf to little endian
|
|
||||||
ShortBuffer shortBuf = byteBuf.asShortBuffer(); // covert to short type
|
|
||||||
short[] arrShort = new short[shortBuf.capacity()]; // array for copy short data
|
|
||||||
float[] arrFloat = new float[shortBuf.capacity()]; // array for copy float data
|
|
||||||
shortBuf.get(arrShort); // put date to arrShort
|
|
||||||
|
|
||||||
for (int i = 0; i < arrShort.length; i++) {
|
|
||||||
arrFloat[i] = arrShort[i] / 32768f; // loop to covert short data to float -1 to 1
|
|
||||||
}
|
|
||||||
streamObj.acceptWaveform(arrFloat); // feed asr engine with float data
|
|
||||||
while (rcgOjb.isReady(streamObj)) { // if engine is ready for unprocessed data
|
|
||||||
|
|
||||||
rcgOjb.decodeStream(streamObj); // decode for this stream
|
|
||||||
}
|
|
||||||
boolean isEndpoint =
|
|
||||||
rcgOjb.isEndpoint(
|
|
||||||
streamObj); // endpoint check, make sure enable_endpoint_detection=true in config
|
|
||||||
// file
|
|
||||||
String nowText = rcgOjb.getResult(streamObj); // get asr result
|
|
||||||
String recText = "";
|
|
||||||
byte[] utf8Data; // for covert text to utf8
|
|
||||||
if (isEndpoint && nowText.length() > 0) {
|
|
||||||
rcgOjb.reSet(streamObj); // reSet stream when detect endpoint
|
|
||||||
segmentId++;
|
|
||||||
preText = nowText;
|
|
||||||
recText = "text(seg_" + String.valueOf(segmentId) + "):" + nowText + "\n";
|
|
||||||
utf8Data = recText.getBytes(StandardCharsets.UTF_8);
|
|
||||||
System.out.println(new String(utf8Data));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!nowText.equals(preText)) { // if preText not equal nowtext
|
|
||||||
preText = nowText;
|
|
||||||
recText = nowText + "\n";
|
|
||||||
utf8Data = recText.getBytes(StandardCharsets.UTF_8);
|
|
||||||
System.out.println(new String(utf8Data));
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println(e);
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** run mic capture thread */
|
|
||||||
public void run() {
|
|
||||||
System.out.println("Started! Please speak...");
|
|
||||||
|
|
||||||
AudioFormat.Encoding encoding = AudioFormat.Encoding.PCM_SIGNED; // the pcm format
|
|
||||||
float rate = 16000.0f; // using 16 kHz
|
|
||||||
int channels = 1; // single channel
|
|
||||||
int sampleSize = 16; // sampleSize 16bit
|
|
||||||
boolean isBigEndian = false; // using little endian
|
|
||||||
|
|
||||||
AudioFormat format =
|
|
||||||
new AudioFormat(
|
|
||||||
encoding, rate, sampleSize, channels, (sampleSize / 8) * channels, rate, isBigEndian);
|
|
||||||
|
|
||||||
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
|
|
||||||
|
|
||||||
// check system support such data format
|
|
||||||
if (!AudioSystem.isLineSupported(info)) {
|
|
||||||
System.out.println(info + " not supported.");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// open a line for capture.
|
|
||||||
|
|
||||||
try {
|
|
||||||
capline = (TargetDataLine) AudioSystem.getLine(info);
|
|
||||||
capline.open(format, capline.getBufferSize());
|
|
||||||
} catch (Exception ex) {
|
|
||||||
System.out.println(ex);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// the buf size for mic captured each time
|
|
||||||
int bufferLengthInBytes = capline.getBufferSize() / 8 * format.getFrameSize();
|
|
||||||
byte[] micData = new byte[bufferLengthInBytes];
|
|
||||||
int numBytesRead;
|
|
||||||
|
|
||||||
capline.start(); // start to capture mic data
|
|
||||||
|
|
||||||
while (thread != null) {
|
|
||||||
// read data from line
|
|
||||||
if ((numBytesRead = capline.read(micData, 0, bufferLengthInBytes)) == -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
decodeSample(micData); // decode mic data
|
|
||||||
}
|
|
||||||
|
|
||||||
// stop and close
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (capline != null) {
|
|
||||||
capline.stop();
|
|
||||||
capline.close();
|
|
||||||
capline = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (Exception ex) {
|
|
||||||
System.err.println(ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // End class DecodeMic
|
|
||||||
|
|
||||||
public static void main(String s[]) {
|
|
||||||
try {
|
|
||||||
String appDir = System.getProperty("user.dir");
|
|
||||||
System.out.println("appdir=" + appDir);
|
|
||||||
String cfgPath = appDir + "/modelconfig.cfg";
|
|
||||||
String soPath = appDir + "/../build/lib/libsherpa-onnx-jni.so";
|
|
||||||
OnlineRecognizer.setSoPath(soPath); // set so. lib for OnlineRecognizer
|
|
||||||
|
|
||||||
DecodeMic decodeEx = new DecodeMic();
|
|
||||||
decodeEx.initModelWithCfg(cfgPath); // init asr engine
|
|
||||||
decodeEx.open(); // open thread for mic
|
|
||||||
System.out.print("Press Enter to EXIT!\n");
|
|
||||||
char i = (char) System.in.read();
|
|
||||||
decodeEx.close();
|
|
||||||
} catch (Exception e) {
|
|
||||||
System.err.println(e);
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user