enginex_bi_series-sherpa-onnx/java-api-examples/VadFromMic.java

// Copyright 2024 Xiaomi Corporation

// This file shows how to use a silero_vad model to detect speech
// and save detected speech into a wave file.

import com.k2fsa.sherpa.onnx.*;
import javax.sound.sampled.*;

/**
 * Example program: reads 16 kHz, 16-bit, mono PCM audio from a capture line, feeds it to a
 * silero VAD model window by window, and writes every segment the VAD emits to its own wave
 * file named {@code seg-<index>-<duration>s.wav} in the current directory.
 *
 * <p>The loop runs for as long as the capture line reports itself open, so in practice the
 * program is stopped by terminating the process.
 */
public class VadFromMic {
  /**
   * Program entry point.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    int sampleRate = 16000;
    int windowSize = 512;
    // please download ./silero_vad.onnx from
    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
    String model = "./silero_vad.onnx";
    SileroVadModelConfig sileroVad =
        SileroVadModelConfig.builder()
            .setModel(model)
            .setThreshold(0.5f)
            .setMinSilenceDuration(0.25f)
            .setMinSpeechDuration(0.5f)
            .setWindowSize(windowSize)
            .build();

    VadModelConfig config =
        VadModelConfig.builder()
            .setSileroVadModelConfig(sileroVad)
            .setSampleRate(sampleRate)
            .setNumThreads(1)
            .setDebug(true)
            .setProvider("cpu")
            .build();

    Vad vad = new Vad(config);
    TargetDataLine targetDataLine = null;

    // Everything after the Vad is created runs inside try/finally so that the Vad and the
    // capture line are released on every exit path, including unchecked exceptions.
    try {
      // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
      // Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
      AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);

      // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
      DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
      targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
      targetDataLine.open(format);
      targetDataLine.start();

      boolean printed = false;
      int index = 0;

      // One window of audio: 2 bytes per 16-bit sample.
      byte[] buffer = new byte[windowSize * 2];
      float[] samples = new float[windowSize];

      while (targetDataLine.isOpen()) {
        int n = targetDataLine.read(buffer, 0, buffer.length);
        if (n != buffer.length) {
          // A short read leaves stale bytes from the previous window in the tail of the
          // buffer; skip it instead of feeding partly old audio to the VAD.
          System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
          continue;
        }
        decodePcm16LittleEndian(buffer, samples);

        vad.acceptWaveform(samples);
        // Print "Detected speech" once per stretch of speech, not once per window.
        if (vad.isSpeechDetected() && !printed) {
          System.out.println("Detected speech");
          printed = true;
        }

        if (!vad.isSpeechDetected()) {
          printed = false;
        }

        // Drain every segment the VAD has queued so far and save each to its own file.
        while (!vad.empty()) {
          float[] segment = vad.front().getSamples();
          float duration = segment.length / (float) sampleRate;
          System.out.printf("Duration: %.3f seconds\n", duration);

          String filename = String.format("seg-%d-%.3fs.wav", index, duration);
          index += 1;
          WaveWriter.write(filename, segment, sampleRate);
          System.out.printf("Saved to %s\n", filename);
          System.out.println("----------");
          vad.pop();
        }
      }
    } catch (LineUnavailableException e) {
      System.out.println("Failed to open target data line: " + e.getMessage());
    } finally {
      if (targetDataLine != null) {
        targetDataLine.close();
      }
      vad.release();
    }
  }

  /**
   * Converts signed 16-bit little-endian PCM bytes into floats by dividing each sample by 32768.
   *
   * @param pcm raw PCM bytes; must hold at least {@code 2 * out.length} bytes
   * @param out destination array; every element is overwritten
   */
  private static void decodePcm16LittleEndian(byte[] pcm, float[] out) {
    for (int i = 0; i != out.length; ++i) {
      // Java bytes are signed: mask the low byte so values >= 0x80 are not sign-extended,
      // and keep the high byte signed so that it carries the sign of the sample.
      int low = pcm[2 * i] & 0xff;
      int high = pcm[2 * i + 1];
      int s = (high << 8) | low;
      out[i] = (float) s / 32768;
    }
  }
}
Add VAD + microphone example for Java API. (#1045) 2024-06-23 18:34:18 +08:00			`// Copyright 2024 Xiaomi Corporation`

			`// This file shows how to use a silero_vad model to detect speech`
			`// and save detected speech into a wave file.`

			`import com.k2fsa.sherpa.onnx.*;`
			`import javax.sound.sampled.*;`

			`public class VadFromMic {`
			`public static void main(String[] args) {`
			`int sampleRate = 16000;`
			`int windowSize = 512;`
			`// please download ./silero_vad.onnx from`
			`// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models`
			`String model = "./silero_vad.onnx";`
			`SileroVadModelConfig sileroVad =`
			`SileroVadModelConfig.builder()`
			`.setModel(model)`
			`.setThreshold(0.5f)`
			`.setMinSilenceDuration(0.25f)`
			`.setMinSpeechDuration(0.5f)`
			`.setWindowSize(windowSize)`
			`.build();`

			`VadModelConfig config =`
			`VadModelConfig.builder()`
			`.setSileroVadModelConfig(sileroVad)`
			`.setSampleRate(sampleRate)`
			`.setNumThreads(1)`
			`.setDebug(true)`
			`.setProvider("cpu")`
			`.build();`

			`Vad vad = new Vad(config);`

			`// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html`
			`// Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian`
			`AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);`

			`// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-`
			`DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);`
			`TargetDataLine targetDataLine;`
			`try {`
			`targetDataLine = (TargetDataLine) AudioSystem.getLine(info);`
			`targetDataLine.open(format);`
			`targetDataLine.start();`
			`} catch (LineUnavailableException e) {`
			`System.out.println("Failed to open target data line: " + e.getMessage());`
			`vad.release();`
			`return;`
			`}`

			`boolean printed = false;`
			`int index = 0;`

			`byte[] buffer = new byte[windowSize * 2];`
			`float[] samples = new float[windowSize];`

			`while (targetDataLine.isOpen()) {`
			`int n = targetDataLine.read(buffer, 0, buffer.length);`
			`if (n <= 0) {`
			`System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);`
			`continue;`
			`}`
			`for (int i = 0; i != windowSize; ++i) {`
			`short low = buffer[2 * i];`
			`short high = buffer[2 * i + 1];`
			`int s = (high << 8) + low;`
			`samples[i] = (float) s / 32768;`
			`}`

			`vad.acceptWaveform(samples);`
			`if (vad.isSpeechDetected() && !printed) {`
			`System.out.println("Detected speech");`
			`printed = true;`
			`}`

			`if (!vad.isSpeechDetected()) {`
			`printed = false;`
			`}`

			`while (!vad.empty()) {`
			`float[] segment = vad.front().getSamples();`
			`float duration = segment.length / (float) sampleRate;`
			`System.out.printf("Duration: %.3f seconds\n", duration);`

			`String filename = String.format("seg-%d-%.3fs.wav", index, duration);`
			`index += 1;`
			`WaveWriter.write(filename, segment, sampleRate);`
			`System.out.printf("Saved to %s\n", filename);`
			`System.out.println("----------");`
			`vad.pop();`
			`}`
			`}`

			`vad.release();`
			`}`
			`}`