C++ API for speaker diarization (#1396)

2024-10-09 12:01:20 +08:00
parent 70165cb42d
commit 59407edcad
39 changed files with 1652 additions and 108 deletions
--- a/.github/scripts/test-speaker-diarization.sh
+++ b/.github/scripts/test-speaker-diarization.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+set -ex
+
+log() {
+  # Print "$*" prefixed with a timestamp and the caller's file:line:function (helper borrowed from espnet).
+  local fname=${BASH_SOURCE[1]##*/}  # caller's script path with the directory part stripped
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+echo "EXE is $EXE"
+echo "PATH: $PATH"
+
+which $EXE  # with `set -e` this stops the script early when the binary is not on PATH
+
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2  # -f: exit non-zero on HTTP errors instead of saving the error page
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+log "specify number of clusters"
+$EXE \
+  --clustering.num-clusters=4 \
+  --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.onnx \
+  --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx \
+  ./0-four-speakers-zh.wav
+
+log "specify threshold for clustering"
+
+$EXE \
+  --clustering.cluster-threshold=0.90 \
+  --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.onnx \
+  --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx \
+  ./0-four-speakers-zh.wav
+
+rm -rf sherpa-onnx-pyannote-*  # remove the extracted segmentation model directory
+rm -fv *.onnx
+rm -fv *.wav
--- a/.github/workflows/export-pyannote-segmentation-to-onnx.yaml
+++ b/.github/workflows/export-pyannote-segmentation-to-onnx.yaml
@@ -29,7 +29,7 @@ jobs:
      - name: Install pyannote
        shell: bash
        run: |
-          pip install pyannote.audio onnx onnxruntime
+          pip install pyannote.audio onnx==1.15.0 onnxruntime==1.16.3

      - name: Run
        shell: bash
--- a/.github/workflows/linux.yaml
+++ b/.github/workflows/linux.yaml
@@ -18,6 +18,7 @@ on:
      - '.github/scripts/test-audio-tagging.sh'
      - '.github/scripts/test-offline-punctuation.sh'
      - '.github/scripts/test-online-punctuation.sh'
+      - '.github/scripts/test-speaker-diarization.sh'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'sherpa-onnx/csrc/*'
@@ -38,6 +39,7 @@ on:
      - '.github/scripts/test-audio-tagging.sh'
      - '.github/scripts/test-offline-punctuation.sh'
      - '.github/scripts/test-online-punctuation.sh'
+      - '.github/scripts/test-speaker-diarization.sh'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'sherpa-onnx/csrc/*'
@@ -143,6 +145,15 @@ jobs:
          name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
          path: install/*

+      - name: Test offline speaker diarization
+        shell: bash
+        run: |
+          du -h -d1 .
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-speaker-diarization
+
+          .github/scripts/test-speaker-diarization.sh
+
      - name: Test offline transducer
        shell: bash
        run: |
--- a/.github/workflows/macos.yaml
+++ b/.github/workflows/macos.yaml
@@ -18,6 +18,7 @@ on:
      - '.github/scripts/test-audio-tagging.sh'
      - '.github/scripts/test-offline-punctuation.sh'
      - '.github/scripts/test-online-punctuation.sh'
+      - '.github/scripts/test-speaker-diarization.sh'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'sherpa-onnx/csrc/*'
@@ -37,6 +38,7 @@ on:
      - '.github/scripts/test-audio-tagging.sh'
      - '.github/scripts/test-offline-punctuation.sh'
      - '.github/scripts/test-online-punctuation.sh'
+      - '.github/scripts/test-speaker-diarization.sh'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'sherpa-onnx/csrc/*'
@@ -115,6 +117,15 @@ jobs:
          otool -L build/bin/sherpa-onnx
          otool -l build/bin/sherpa-onnx

+      - name: Test offline speaker diarization
+        shell: bash
+        run: |
+          du -h -d1 .
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-speaker-diarization
+
+          .github/scripts/test-speaker-diarization.sh
+
      - name: Test offline transducer
        shell: bash
        run: |
--- a/.github/workflows/speaker-diarization.yaml
+++ b/.github/workflows/speaker-diarization.yaml
@@ -67,7 +67,7 @@ jobs:
          curl -SL -O https://huggingface.co/csukuangfj/pyannote-models/resolve/main/segmentation-3.0/pytorch_model.bin

          test_wavs=(
-            0-two-speakers-zh.wav
+            0-four-speakers-zh.wav
            1-two-speakers-en.wav
            2-two-speakers-en.wav
            3-two-speakers-en.wav
--- a/.github/workflows/windows-x64.yaml
+++ b/.github/workflows/windows-x64.yaml
@@ -17,6 +17,7 @@ on:
      - '.github/scripts/test-audio-tagging.sh'
      - '.github/scripts/test-offline-punctuation.sh'
      - '.github/scripts/test-online-punctuation.sh'
+      - '.github/scripts/test-speaker-diarization.sh'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'sherpa-onnx/csrc/*'
@@ -34,6 +35,7 @@ on:
      - '.github/scripts/test-audio-tagging.sh'
      - '.github/scripts/test-offline-punctuation.sh'
      - '.github/scripts/test-online-punctuation.sh'
+      - '.github/scripts/test-speaker-diarization.sh'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'sherpa-onnx/csrc/*'
@@ -87,6 +89,15 @@ jobs:
          name: release-windows-x64-${{ matrix.shared_lib }}-${{ matrix.with_tts }}
          path: build/install/*

+      - name: Test offline speaker diarization
+        shell: bash
+        run: |
+          du -h -d1 .
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-speaker-diarization.exe
+
+          .github/scripts/test-speaker-diarization.sh
+
      - name: Test online punctuation
        shell: bash
        run: |
--- a/.github/workflows/windows-x86.yaml
+++ b/.github/workflows/windows-x86.yaml
@@ -17,6 +17,7 @@ on:
      - '.github/scripts/test-audio-tagging.sh'
      - '.github/scripts/test-offline-punctuation.sh'
      - '.github/scripts/test-online-punctuation.sh'
+      - '.github/scripts/test-speaker-diarization.sh'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'sherpa-onnx/csrc/*'
@@ -34,6 +35,7 @@ on:
      - '.github/scripts/test-audio-tagging.sh'
      - '.github/scripts/test-offline-punctuation.sh'
      - '.github/scripts/test-online-punctuation.sh'
+      - '.github/scripts/test-speaker-diarization.sh'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'sherpa-onnx/csrc/*'
@@ -87,6 +89,15 @@ jobs:
          name: release-windows-x86-${{ matrix.shared_lib }}-${{ matrix.with_tts }}
          path: build/install/*

+      - name: Test offline speaker diarization
+        shell: bash
+        run: |
+          du -h -d1 .
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-speaker-diarization.exe
+
+          .github/scripts/test-speaker-diarization.sh
+
      - name: Test online punctuation
        shell: bash
        run: |