Kotlin API for speaker diarization (#1415)

2024-10-11 14:41:53 +08:00
parent eefc172095
commit 2d412b1190
7 changed files with 412 additions and 1 deletions
--- a/kotlin-api-examples/OfflineSpeakerDiarization.kt
+++ b/kotlin-api-examples/OfflineSpeakerDiarization.kt
@@ -0,0 +1 @@
+../sherpa-onnx/kotlin-api/OfflineSpeakerDiarization.kt
--- a/kotlin-api-examples/run.sh
+++ b/kotlin-api-examples/run.sh
@@ -285,6 +285,37 @@ function testPunctuation() {
  java -Djava.library.path=../build/lib -jar $out_filename
 }

+function testOfflineSpeakerDiarization() {
+  if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then  # fetch and unpack the segmentation model if missing
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+    tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+    rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2  # remove the archive after extraction
+  fi
+
+  if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then  # fetch the speaker embedding model if missing
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+  fi
+
+  if [ ! -f ./0-four-speakers-zh.wav ]; then  # fetch the test wave file if missing
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+  fi
+
+  out_filename=test_offline_speaker_diarization.jar  # jar produced by the kotlinc-jvm call below
+  kotlinc-jvm -include-runtime -d $out_filename \
+    test_offline_speaker_diarization.kt \
+    OfflineSpeakerDiarization.kt \
+    Speaker.kt \
+    OnlineStream.kt \
+    WaveReader.kt \
+    faked-asset-manager.kt \
+    faked-log.kt
+
+  ls -lh $out_filename  # show the size of the generated jar
+
+  java -Djava.library.path=../build/lib -jar $out_filename  # run the example with java.library.path set to ../build/lib
+}
+
+testOfflineSpeakerDiarization
 testSpeakerEmbeddingExtractor
 testOnlineAsr
 testTts
--- a/kotlin-api-examples/test_offline_speaker_diarization.kt
+++ b/kotlin-api-examples/test_offline_speaker_diarization.kt
@@ -0,0 +1,53 @@
+package com.k2fsa.sherpa.onnx
+
+// Entry point: runs the offline speaker diarization example.
+fun main() =
+  testOfflineSpeakerDiarization()
+
+// Progress callback passed to processWithCallback(): prints the share of
+// processed chunks as a percentage with two decimals and always returns 0.
+fun callback(numProcessedChunks: Int, numTotalChunks: Int, arg: Long): Int {
+  val percent = numProcessedChunks.toFloat() / numTotalChunks * 100
+  println("Progress: ${"%.2f".format(percent)}%")
+  return 0
+}
+
+// Diarizes ./0-four-speakers-zh.wav and prints each segment's start, end and
+// speaker. Returns early if the wave's sample rate is not the one sd expects.
+fun testOfflineSpeakerDiarization() {
+  val config = OfflineSpeakerDiarizationConfig(
+    segmentation=OfflineSpeakerSegmentationModelConfig(
+      pyannote=OfflineSpeakerSegmentationPyannoteModelConfig("./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"),
+    ),
+    embedding=SpeakerEmbeddingExtractorConfig(
+      model="./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx",
+    ),
+
+    // The test wave file ./0-four-speakers-zh.wav contains four speakers, so
+    // we use numClusters=4 here. If you don't know the number of speakers
+    // in the test wave file, please set the threshold like below.
+    //
+    // clustering=FastClusteringConfig(threshold=0.5),
+    //
+    // WARNING: You need to tune threshold by yourself.
+    // A larger threshold leads to fewer clusters, i.e., fewer speakers.
+    // A smaller threshold leads to more clusters, i.e., more speakers.
+    //
+    clustering=FastClusteringConfig(numClusters=4),
+  )
+
+  val sd = OfflineSpeakerDiarization(config=config)
+
+  val waveData = WaveReader.readWave(filename = "./0-four-speakers-zh.wav")
+
+  if (sd.sampleRate() != waveData.sampleRate) {
+    println("Expected sample rate: ${sd.sampleRate()}, given: ${waveData.sampleRate}")
+    return
+  }
+
+  // val segments = sd.process(waveData.samples) // this one is also ok
+  val segments = sd.processWithCallback(waveData.samples, callback=::callback)
+  for (segment in segments) {
+    println("${segment.start} -- ${segment.end} speaker_${segment.speaker}")
+  }
+}
				`@@ -0,0 +1 @@`
				`../sherpa-onnx/kotlin-api/OfflineSpeakerDiarization.kt`