Java API for speaker diarization (#1416)
This commit is contained in:
7
.github/workflows/run-java-test.yaml
vendored
7
.github/workflows/run-java-test.yaml
vendored
@@ -107,6 +107,13 @@ jobs:
|
||||
make -j4
|
||||
ls -lh lib
|
||||
|
||||
- name: Run java test (speaker diarization)
|
||||
shell: bash
|
||||
run: |
|
||||
cd ./java-api-examples
|
||||
./run-offline-speaker-diarization.sh
|
||||
rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
|
||||
|
||||
- name: Run java test (kws)
|
||||
shell: bash
|
||||
run: |
|
||||
|
||||
99
java-api-examples/OfflineSpeakerDiarizationDemo.java
Normal file
99
java-api-examples/OfflineSpeakerDiarizationDemo.java
Normal file
@@ -0,0 +1,99 @@
|
||||
// Copyright 2024 Xiaomi Corporation
|
||||
|
||||
// This file shows how to use the sherpa-onnx Java API for speaker diarization.
|
||||
import com.k2fsa.sherpa.onnx.*;
|
||||
|
||||
public class OfflineSpeakerDiarizationDemo {
|
||||
public static void main(String[] args) {
|
||||
/* Please use the following commands to download files used in this file
|
||||
Step 1: Download a speaker segmentation model
|
||||
|
||||
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
|
||||
for a list of available models. The following is an example
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||
|
||||
Step 2: Download a speaker embedding extractor model
|
||||
|
||||
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
|
||||
for a list of available models. The following is an example
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||
|
||||
Step 3. Download test wave files
|
||||
|
||||
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
|
||||
for a list of available test wave files. The following is an example
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||
|
||||
Step 4. Run it
|
||||
*/
|
||||
|
||||
String segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
|
||||
String embeddingModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
|
||||
String waveFilename = "./0-four-speakers-zh.wav";
|
||||
|
||||
WaveReader reader = new WaveReader(waveFilename);
|
||||
|
||||
OfflineSpeakerSegmentationPyannoteModelConfig pyannote =
|
||||
OfflineSpeakerSegmentationPyannoteModelConfig.builder().setModel(segmentationModel).build();
|
||||
|
||||
OfflineSpeakerSegmentationModelConfig segmentation =
|
||||
OfflineSpeakerSegmentationModelConfig.builder()
|
||||
.setPyannote(pyannote)
|
||||
.setDebug(true)
|
||||
.build();
|
||||
|
||||
SpeakerEmbeddingExtractorConfig embedding =
|
||||
SpeakerEmbeddingExtractorConfig.builder().setModel(embeddingModel).setDebug(true).build();
|
||||
|
||||
// The test wave file ./0-four-speakers-zh.wav contains four speakers, so
|
||||
// we use numClusters=4 here. If you don't know the number of speakers
|
||||
// in the test wave file, please set the numClusters to -1 and provide
|
||||
// threshold for clustering
|
||||
FastClusteringConfig clustering =
|
||||
FastClusteringConfig.builder()
|
||||
.setNumClusters(4) // set it to -1 if you don't know the actual number
|
||||
.setThreshold(0.5f)
|
||||
.build();
|
||||
|
||||
OfflineSpeakerDiarizationConfig config =
|
||||
OfflineSpeakerDiarizationConfig.builder()
|
||||
.setSegmentation(segmentation)
|
||||
.setEmbedding(embedding)
|
||||
.setClustering(clustering)
|
||||
.setMinDurationOn(0.2f)
|
||||
.setMinDurationOff(0.5f)
|
||||
.build();
|
||||
|
||||
OfflineSpeakerDiarization sd = new OfflineSpeakerDiarization(config);
|
||||
if (sd.getSampleRate() != reader.getSampleRate()) {
|
||||
System.out.printf(
|
||||
"Expected sample rate: %d, given: %d\n", sd.getSampleRate(), reader.getSampleRate());
|
||||
return;
|
||||
}
|
||||
|
||||
// OfflineSpeakerDiarizationSegment[] segments = sd.process(reader.getSamples());
|
||||
// without callback is also ok
|
||||
|
||||
// or you can use a callback to show the progress
|
||||
OfflineSpeakerDiarizationSegment[] segments =
|
||||
sd.processWithCallback(
|
||||
reader.getSamples(),
|
||||
(int numProcessedChunks, int numTotalChunks, long arg) -> {
|
||||
float progress = 100.0f * numProcessedChunks / numTotalChunks;
|
||||
System.out.printf("Progress: %.2f%%\n", progress);
|
||||
|
||||
return 0;
|
||||
});
|
||||
|
||||
for (OfflineSpeakerDiarizationSegment s : segments) {
|
||||
System.out.printf("%.3f -- %.3f speaker_%02d\n", s.getStart(), s.getEnd(), s.getSpeaker());
|
||||
}
|
||||
|
||||
sd.release();
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,12 @@ This directory contains examples for the JAVA API of sherpa-onnx.
|
||||
|
||||
# Usage
|
||||
|
||||
## Non-streaming speaker diarization
|
||||
|
||||
```bash
|
||||
./run-offline-speaker-diarization.sh
|
||||
```
|
||||
|
||||
## Streaming Speech recognition
|
||||
|
||||
```
|
||||
|
||||
45
java-api-examples/run-offline-speaker-diarization.sh
Executable file
45
java-api-examples/run-offline-speaker-diarization.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env bash
#
# Runs OfflineSpeakerDiarizationDemo.java from this directory.
#
# Steps, each skipped when its output already exists:
#   1. Build the JNI shared library into ../build/lib.
#   2. Build ../sherpa-onnx/java-api/build/sherpa-onnx.jar.
#   3. Download the segmentation model, the embedding model and a test wave.
#   4. Launch the demo with java in source-file mode.

set -ex

# Step 1: build the JNI library unless a .dylib or a .so is already present.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Step 2: build the Java API jar.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Step 3: download the models and the test wave file.
if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

# Step 4: run the demo. The library path is quoted so that a checkout
# directory containing spaces is not split into several arguments.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./OfflineSpeakerDiarizationDemo.java
|
||||
@@ -68,6 +68,15 @@ java_files += KeywordSpotterConfig.java
|
||||
java_files += KeywordSpotterResult.java
|
||||
java_files += KeywordSpotter.java
|
||||
|
||||
java_files += OfflineSpeakerSegmentationPyannoteModelConfig.java
|
||||
java_files += OfflineSpeakerSegmentationModelConfig.java
|
||||
java_files += FastClusteringConfig.java
|
||||
java_files += OfflineSpeakerDiarizationConfig.java
|
||||
java_files += OfflineSpeakerDiarizationSegment.java
|
||||
java_files += OfflineSpeakerDiarizationCallback.java
|
||||
java_files += OfflineSpeakerDiarization.java
|
||||
|
||||
|
||||
class_files := $(java_files:%.java=%.class)
|
||||
|
||||
java_files := $(addprefix src/$(package_dir)/,$(java_files))
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
// Copyright 2024 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
/**
 * Immutable clustering settings, created through {@link #builder()}.
 *
 * <p>Holds a cluster count (default {@code -1}) and a clustering threshold (default
 * {@code 0.5f}).
 */
public class FastClusteringConfig {
    // Field names are left exactly as they were; they may be looked up by name
    // from outside this class.
    private final int numClusters;
    private final float threshold;

    private FastClusteringConfig(int numClusters, float threshold) {
        this.numClusters = numClusters;
        this.threshold = threshold;
    }

    /** Returns a new builder pre-populated with the default values. */
    public static Builder builder() {
        return new Builder();
    }

    /** Returns the configured clustering threshold. */
    public float getThreshold() {
        return this.threshold;
    }

    /** Returns the configured number of clusters. */
    public int getNumClusters() {
        return this.numClusters;
    }

    /** Fluent builder for {@link FastClusteringConfig}. */
    public static class Builder {
        private float threshold = 0.5f;
        private int numClusters = -1;

        /** Stores the clustering threshold and returns this builder for chaining. */
        public Builder setThreshold(float threshold) {
            this.threshold = threshold;
            return this;
        }

        /** Stores the number of clusters and returns this builder for chaining. */
        public Builder setNumClusters(int numClusters) {
            this.numClusters = numClusters;
            return this;
        }

        /** Creates the immutable configuration from the values collected so far. */
        public FastClusteringConfig build() {
            return new FastClusteringConfig(numClusters, threshold);
        }
    }
}
|
||||
@@ -0,0 +1,61 @@
|
||||
// Copyright 2024 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
public class OfflineSpeakerDiarization {
|
||||
static {
|
||||
System.loadLibrary("sherpa-onnx-jni");
|
||||
}
|
||||
|
||||
private long ptr = 0;
|
||||
|
||||
public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config) {
|
||||
ptr = newFromFile(config);
|
||||
}
|
||||
|
||||
public int getSampleRate() {
|
||||
return getSampleRate(ptr);
|
||||
}
|
||||
|
||||
// Only config.clustering is used. All other fields are ignored
|
||||
public void setConfig(OfflineSpeakerDiarizationConfig config) {
|
||||
setConfig(ptr, config);
|
||||
}
|
||||
|
||||
public OfflineSpeakerDiarizationSegment[] process(float[] samples) {
|
||||
return process(ptr, samples);
|
||||
}
|
||||
|
||||
public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback) {
|
||||
return processWithCallback(ptr, samples, callback, 0);
|
||||
}
|
||||
|
||||
public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback, long arg) {
|
||||
return processWithCallback(ptr, samples, callback, arg);
|
||||
}
|
||||
|
||||
protected void finalize() throws Throwable {
|
||||
release();
|
||||
}
|
||||
|
||||
// You'd better call it manually if it is not used anymore
|
||||
public void release() {
|
||||
if (this.ptr == 0) {
|
||||
return;
|
||||
}
|
||||
delete(this.ptr);
|
||||
this.ptr = 0;
|
||||
}
|
||||
|
||||
private native int getSampleRate(long ptr);
|
||||
|
||||
private native void delete(long ptr);
|
||||
|
||||
private native long newFromFile(OfflineSpeakerDiarizationConfig config);
|
||||
|
||||
private native void setConfig(long ptr, OfflineSpeakerDiarizationConfig config);
|
||||
|
||||
private native OfflineSpeakerDiarizationSegment[] process(long ptr, float[] samples);
|
||||
|
||||
private native OfflineSpeakerDiarizationSegment[] processWithCallback(long ptr, float[] samples, OfflineSpeakerDiarizationCallback callback, long arg);
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
// Copyright 2024 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
/**
 * Progress callback accepted by {@code OfflineSpeakerDiarization.processWithCallback}.
 */
@FunctionalInterface
public interface OfflineSpeakerDiarizationCallback {
    /**
     * Called to report diarization progress.
     *
     * @param numProcessedChunks number of chunks processed so far
     * @param numTotalChunks total number of chunks
     * @param arg the user-defined value that was passed to {@code processWithCallback}
     * @return a status value for the caller; NOTE(review): how the native side interprets it is
     *     not visible here, the bundled demo returns 0
     */
    Integer invoke(int numProcessedChunks, int numTotalChunks, long arg);
}
|
||||
@@ -0,0 +1,79 @@
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
public class OfflineSpeakerDiarizationConfig {
|
||||
private final OfflineSpeakerSegmentationModelConfig segmentation;
|
||||
private final SpeakerEmbeddingExtractorConfig embedding;
|
||||
private final FastClusteringConfig clustering;
|
||||
private final float minDurationOn;
|
||||
private final float minDurationOff;
|
||||
|
||||
private OfflineSpeakerDiarizationConfig(Builder builder) {
|
||||
this.segmentation = builder.segmentation;
|
||||
this.embedding = builder.embedding;
|
||||
this.clustering = builder.clustering;
|
||||
this.minDurationOff = builder.minDurationOff;
|
||||
this.minDurationOn = builder.minDurationOn;
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
public OfflineSpeakerSegmentationModelConfig getSegmentation() {
|
||||
return segmentation;
|
||||
}
|
||||
|
||||
public SpeakerEmbeddingExtractorConfig getEmbedding() {
|
||||
return embedding;
|
||||
}
|
||||
|
||||
public FastClusteringConfig getClustering() {
|
||||
return clustering;
|
||||
}
|
||||
|
||||
public float getMinDurationOff() {
|
||||
return minDurationOff;
|
||||
}
|
||||
|
||||
public float getMinDurationOn() {
|
||||
return minDurationOn;
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private OfflineSpeakerSegmentationModelConfig segmentation = OfflineSpeakerSegmentationModelConfig.builder().build();
|
||||
private SpeakerEmbeddingExtractorConfig embedding = SpeakerEmbeddingExtractorConfig.builder().build();
|
||||
private FastClusteringConfig clustering = FastClusteringConfig.builder().build();
|
||||
private float minDurationOn = 0.2f;
|
||||
private float minDurationOff = 0.5f;
|
||||
|
||||
public OfflineSpeakerDiarizationConfig build() {
|
||||
return new OfflineSpeakerDiarizationConfig(this);
|
||||
}
|
||||
|
||||
public Builder setSegmentation(OfflineSpeakerSegmentationModelConfig segmentation) {
|
||||
this.segmentation = segmentation;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setEmbedding(SpeakerEmbeddingExtractorConfig embedding) {
|
||||
this.embedding = embedding;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setClustering(FastClusteringConfig clustering) {
|
||||
this.clustering = clustering;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setMinDurationOff(float minDurationOff) {
|
||||
this.minDurationOff = minDurationOff;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setMinDurationOn(float minDurationOn) {
|
||||
this.minDurationOn = minDurationOn;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
// Copyright 2024 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
/**
 * One diarization result: a start value, an end value and the index of the speaker assigned to
 * that span. Instances are immutable.
 */
public class OfflineSpeakerDiarizationSegment {
    // Field names and the public constructor signature are left exactly as
    // they were; they may be used from outside this class.
    private final float start;
    private final float end;
    private final int speaker;

    /**
     * Creates a segment.
     *
     * @param start start of the segment (presumably in seconds; confirm against the producer)
     * @param end end of the segment, in the same unit as {@code start}
     * @param speaker index of the speaker assigned to this segment
     */
    public OfflineSpeakerDiarizationSegment(float start, float end, int speaker) {
        this.speaker = speaker;
        this.end = end;
        this.start = start;
    }

    /** Returns the speaker index of this segment. */
    public int getSpeaker() {
        return this.speaker;
    }

    /** Returns the start of this segment. */
    public float getStart() {
        return this.start;
    }

    /** Returns the end of this segment. */
    public float getEnd() {
        return this.end;
    }
}
|
||||
@@ -0,0 +1,52 @@
|
||||
// Copyright 2024 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
public class OfflineSpeakerSegmentationModelConfig {
|
||||
private final OfflineSpeakerSegmentationPyannoteModelConfig pyannote;
|
||||
private final int numThreads;
|
||||
private final boolean debug;
|
||||
private final String provider;
|
||||
|
||||
private OfflineSpeakerSegmentationModelConfig(Builder builder) {
|
||||
this.pyannote = builder.pyannote;
|
||||
this.numThreads = builder.numThreads;
|
||||
this.debug = builder.debug;
|
||||
this.provider = builder.provider;
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private OfflineSpeakerSegmentationPyannoteModelConfig pyannote = OfflineSpeakerSegmentationPyannoteModelConfig.builder().build();
|
||||
private int numThreads = 1;
|
||||
private boolean debug = true;
|
||||
private String provider = "cpu";
|
||||
|
||||
public OfflineSpeakerSegmentationModelConfig build() {
|
||||
return new OfflineSpeakerSegmentationModelConfig(this);
|
||||
}
|
||||
|
||||
public Builder setPyannote(OfflineSpeakerSegmentationPyannoteModelConfig pyannote) {
|
||||
this.pyannote = pyannote;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setNumThreads(int numThreads) {
|
||||
this.numThreads = numThreads;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setDebug(boolean debug) {
|
||||
this.debug = debug;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setProvider(String provider) {
|
||||
this.provider = provider;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
// Copyright 2024 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
/**
 * Immutable settings for a pyannote speaker segmentation model, created through
 * {@link #builder()}. The only setting is the model path, which defaults to the empty string.
 */
public class OfflineSpeakerSegmentationPyannoteModelConfig {
    // Field name is left exactly as it was; it may be looked up by name from
    // outside this class.
    private final String model;

    private OfflineSpeakerSegmentationPyannoteModelConfig(String model) {
        this.model = model;
    }

    /** Returns a new builder whose model path is the empty string. */
    public static Builder builder() {
        return new Builder();
    }

    /** Returns the configured model path. */
    public String getModel() {
        return this.model;
    }

    /** Fluent builder for {@link OfflineSpeakerSegmentationPyannoteModelConfig}. */
    public static class Builder {
        private String model = "";

        /** Stores the model path and returns this builder for chaining. */
        public Builder setModel(String model) {
            this.model = model;
            return this;
        }

        /** Creates the immutable configuration from the value collected so far. */
        public OfflineSpeakerSegmentationPyannoteModelConfig build() {
            return new OfflineSpeakerSegmentationPyannoteModelConfig(model);
        }
    }
}
|
||||
@@ -1,3 +1,5 @@
|
||||
// Copyright 2024 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
@FunctionalInterface
|
||||
|
||||
@@ -50,5 +50,4 @@ public class SpeakerEmbeddingExtractorConfig {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user