100 lines
4.0 KiB
Java
100 lines
4.0 KiB
Java
// Copyright 2024 Xiaomi Corporation
|
|
|
|
// This file shows how to use sherpa-onnx Java API for speaker diarization.
|
|
import com.k2fsa.sherpa.onnx.*;
|
|
|
|
public class OfflineSpeakerDiarizationDemo {
|
|
public static void main(String[] args) {
|
|
/* Please use the following commands to download files used in this file
|
|
Step 1: Download a speaker segmentation model
|
|
|
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
|
|
for a list of available models. The following is an example
|
|
|
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
|
|
|
Step 2: Download a speaker embedding extractor model
|
|
|
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
|
|
for a list of available models. The following is an example
|
|
|
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
|
|
|
Step 3. Download test wave files
|
|
|
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
|
|
for a list of available test wave files. The following is an example
|
|
|
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
|
|
|
Step 4. Run it
|
|
*/
|
|
|
|
String segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
|
|
String embeddingModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
|
|
String waveFilename = "./0-four-speakers-zh.wav";
|
|
|
|
WaveReader reader = new WaveReader(waveFilename);
|
|
|
|
OfflineSpeakerSegmentationPyannoteModelConfig pyannote =
|
|
OfflineSpeakerSegmentationPyannoteModelConfig.builder().setModel(segmentationModel).build();
|
|
|
|
OfflineSpeakerSegmentationModelConfig segmentation =
|
|
OfflineSpeakerSegmentationModelConfig.builder()
|
|
.setPyannote(pyannote)
|
|
.setDebug(true)
|
|
.build();
|
|
|
|
SpeakerEmbeddingExtractorConfig embedding =
|
|
SpeakerEmbeddingExtractorConfig.builder().setModel(embeddingModel).setDebug(true).build();
|
|
|
|
// The test wave file ./0-four-speakers-zh.wav contains four speakers, so
|
|
// we use numClusters=4 here. If you don't know the number of speakers
|
|
// in the test wave file, please set the numClusters to -1 and provide
|
|
// threshold for clustering
|
|
FastClusteringConfig clustering =
|
|
FastClusteringConfig.builder()
|
|
.setNumClusters(4) // set it to -1 if you don't know the actual number
|
|
.setThreshold(0.5f)
|
|
.build();
|
|
|
|
OfflineSpeakerDiarizationConfig config =
|
|
OfflineSpeakerDiarizationConfig.builder()
|
|
.setSegmentation(segmentation)
|
|
.setEmbedding(embedding)
|
|
.setClustering(clustering)
|
|
.setMinDurationOn(0.2f)
|
|
.setMinDurationOff(0.5f)
|
|
.build();
|
|
|
|
OfflineSpeakerDiarization sd = new OfflineSpeakerDiarization(config);
|
|
if (sd.getSampleRate() != reader.getSampleRate()) {
|
|
System.out.printf(
|
|
"Expected sample rate: %d, given: %d\n", sd.getSampleRate(), reader.getSampleRate());
|
|
return;
|
|
}
|
|
|
|
// OfflineSpeakerDiarizationSegment[] segments = sd.process(reader.getSamples());
|
|
// without callback is also ok
|
|
|
|
// or you can use a callback to show the progress
|
|
OfflineSpeakerDiarizationSegment[] segments =
|
|
sd.processWithCallback(
|
|
reader.getSamples(),
|
|
(int numProcessedChunks, int numTotalChunks, long arg) -> {
|
|
float progress = 100.0f * numProcessedChunks / numTotalChunks;
|
|
System.out.printf("Progress: %.2f%%\n", progress);
|
|
|
|
return 0;
|
|
});
|
|
|
|
for (OfflineSpeakerDiarizationSegment s : segments) {
|
|
System.out.printf("%.3f -- %.3f speaker_%02d\n", s.getStart(), s.getEnd(), s.getSpeaker());
|
|
}
|
|
|
|
sd.release();
|
|
}
|
|
}
|