Export Pyannote speaker segmentation models to ONNX (#1382)
.github/workflows/export-pyannote-segmentation-to-onnx.yaml (vendored, new file, 86 lines)

```yaml
name: export-pyannote-segmentation-to-onnx

on:
  workflow_dispatch:

concurrency:
  group: export-pyannote-segmentation-to-onnx-${{ github.ref }}
  cancel-in-progress: true

jobs:
  export-pyannote-segmentation-to-onnx:
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export Pyannote segmentation models to ONNX
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [macos-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install pyannote
        shell: bash
        run: |
          pip install pyannote.audio onnx onnxruntime

      - name: Run
        shell: bash
        run: |
          d=sherpa-onnx-pyannote-segmentation-3-0
          src=$PWD/$d
          mkdir -p $src

          pushd scripts/pyannote/segmentation
          ./run.sh
          cp ./*.onnx $src/
          cp ./README.md $src/
          cp ./LICENSE $src/
          cp ./run.sh $src/
          cp ./*.py $src/

          popd
          ls -lh $d
          tar cjfv $d.tar.bz2 $d

      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: speaker-segmentation-models

      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            d=sherpa-onnx-pyannote-segmentation-3-0
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false
            git clone https://huggingface.co/csukuangfj/$d huggingface
            cp -v $d/* ./huggingface
            cd huggingface
            git lfs track "*.onnx"
            git status
            git add .
            git status
            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main
```
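For anyone consuming the artifacts: the release step above uploads `sherpa-onnx-pyannote-segmentation-3-0.tar.bz2` under the `speaker-segmentation-models` tag. A minimal sketch of fetching and unpacking it; the URL is pieced together from the workflow's `tag:` and `$d` values and is an assumption, not an officially documented endpoint:

```python
# Fetch and unpack the archive published by the workflow above.
# The URL is derived from the workflow's repo_name, tag, and $d fields.
import tarfile
import urllib.request

name = "sherpa-onnx-pyannote-segmentation-3-0.tar.bz2"
url = (
    "https://github.com/k2-fsa/sherpa-onnx/releases/download/"
    "speaker-segmentation-models/" + name
)
urllib.request.urlretrieve(url, name)

with tarfile.open(name, "r:bz2") as tar:
    tar.list(verbose=False)  # print the member names
    tar.extractall(".")
```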
scripts/pyannote/segmentation/.gitignore (vendored, new file, 2 lines)

```
*.bin
*.onnx
```
scripts/pyannote/segmentation/export-onnx.py (executable, new file, 128 lines)

```python
#!/usr/bin/env python3

from typing import Any, Dict

import onnx
import torch
from onnxruntime.quantization import QuantType, quantize_dynamic
from pyannote.audio import Model
from pyannote.audio.core.task import Problem, Resolution


def add_meta_data(filename: str, meta_data: Dict[str, Any]):
    """Add meta data to an ONNX model. It is changed in-place.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs.
    """
    model = onnx.load(filename)

    while len(model.metadata_props):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)

    onnx.save(model, filename)


@torch.no_grad()
def main():
    # You can download ./pytorch_model.bin from
    # https://hf-mirror.com/csukuangfj/pyannote-models/tree/main/segmentation-3.0
    pt_filename = "./pytorch_model.bin"
    model = Model.from_pretrained(pt_filename)
    model.eval()
    assert model.dimension == 7, model.dimension
    print(model.specifications)

    assert (
        model.specifications.problem == Problem.MONO_LABEL_CLASSIFICATION
    ), model.specifications.problem

    assert (
        model.specifications.resolution == Resolution.FRAME
    ), model.specifications.resolution

    assert model.specifications.duration == 10.0, model.specifications.duration

    assert model.audio.sample_rate == 16000, model.audio.sample_rate

    # (batch, num_channels, num_samples)
    assert list(model.example_input_array.shape) == [
        1,
        1,
        16000 * 10,
    ], model.example_input_array.shape

    example_output = model(model.example_input_array)

    # (batch, num_frames, num_classes)
    assert list(example_output.shape) == [1, 589, 7], example_output.shape

    assert model.receptive_field.step == 0.016875, model.receptive_field.step
    assert model.receptive_field.duration == 0.0619375, model.receptive_field.duration
    assert model.receptive_field.step * 16000 == 270, model.receptive_field.step * 16000
    assert model.receptive_field.duration * 16000 == 991, (
        model.receptive_field.duration * 16000
    )

    opset_version = 18

    filename = "model.onnx"
    torch.onnx.export(
        model,
        model.example_input_array,
        filename,
        opset_version=opset_version,
        input_names=["x"],
        output_names=["y"],
        dynamic_axes={
            "x": {0: "N", 2: "T"},
            "y": {0: "N", 1: "T"},
        },
    )

    sample_rate = model.audio.sample_rate

    window_size = int(model.specifications.duration) * 16000
    receptive_field_size = int(model.receptive_field.duration * 16000)
    receptive_field_shift = int(model.receptive_field.step * 16000)

    meta_data = {
        "num_speakers": len(model.specifications.classes),
        "powerset_max_classes": model.specifications.powerset_max_classes,
        "num_classes": model.dimension,
        "sample_rate": sample_rate,
        "window_size": window_size,
        "receptive_field_size": receptive_field_size,
        "receptive_field_shift": receptive_field_shift,
        "model_type": "pyannote-segmentation-3.0",
        "version": "1",
        "model_author": "pyannote",
        "maintainer": "k2-fsa",
        "url_1": "https://huggingface.co/pyannote/segmentation-3.0",
        "url_2": "https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0",
        "license": "https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE",
    }
    add_meta_data(filename=filename, meta_data=meta_data)

    print("Generate int8 quantization models")

    filename_int8 = "model.int8.onnx"
    quantize_dynamic(
        model_input=filename,
        model_output=filename_int8,
        weight_type=QuantType.QUInt8,
    )

    print(f"Saved to {filename} and {filename_int8}")


if __name__ == "__main__":
    main()
```
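A quick sanity check, not part of the PR: the metadata written by `add_meta_data()` can be read back through onnxruntime's model metadata API. Note that every value round-trips as a string, because ONNX `metadata_props` stores strings:

```python
# Read back the custom metadata written by add_meta_data() above.
import onnxruntime

sess = onnxruntime.InferenceSession(
    "model.onnx", providers=["CPUExecutionProvider"]
)
meta = sess.get_modelmeta().custom_metadata_map

# Values are stored as strings, so compare against string literals.
assert meta["sample_rate"] == "16000", meta["sample_rate"]
assert meta["window_size"] == "160000", meta["window_size"]  # 10 s at 16 kHz
assert meta["receptive_field_shift"] == "270", meta["receptive_field_shift"]
print(meta)
```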
scripts/pyannote/segmentation/notes.md (new file, 78 lines)

````markdown
# config.yaml

```yaml
task:
  _target_: pyannote.audio.tasks.SpeakerDiarization
  duration: 10.0
  max_speakers_per_chunk: 3
  max_speakers_per_frame: 2
model:
  _target_: pyannote.audio.models.segmentation.PyanNet
  sample_rate: 16000
  num_channels: 1
  sincnet:
    stride: 10
  lstm:
    hidden_size: 128
    num_layers: 4
    bidirectional: true
    monolithic: true
  linear:
    hidden_size: 128
    num_layers: 2
```

# Model architecture of ./pytorch_model.bin

`print(model)`:

```python3
PyanNet(
  (sincnet): SincNet(
    (wav_norm1d): InstanceNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
    (conv1d): ModuleList(
      (0): Encoder(
        (filterbank): ParamSincFB()
      )
      (1): Conv1d(80, 60, kernel_size=(5,), stride=(1,))
      (2): Conv1d(60, 60, kernel_size=(5,), stride=(1,))
    )
    (pool1d): ModuleList(
      (0-2): 3 x MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    )
    (norm1d): ModuleList(
      (0): InstanceNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
      (1-2): 2 x InstanceNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
    )
  )
  (lstm): LSTM(60, 128, num_layers=4, batch_first=True, dropout=0.5, bidirectional=True)
  (linear): ModuleList(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=128, bias=True)
  )
  (classifier): Linear(in_features=128, out_features=7, bias=True)
  (activation): LogSoftmax(dim=-1)
)
```

```python3
>>> list(model.specifications)
[Specifications(problem=<Problem.MONO_LABEL_CLASSIFICATION: 1>, resolution=<Resolution.FRAME: 1>, duration=10.0, min_duration=None, warm_up=(0.0, 0.0), classes=['speaker#1', 'speaker#2', 'speaker#3'], powerset_max_classes=2, permutation_invariant=True)]
```

```python3
>>> model.hparams
"linear": {'hidden_size': 128, 'num_layers': 2}
"lstm": {'hidden_size': 128, 'num_layers': 4, 'bidirectional': True, 'monolithic': True, 'dropout': 0.5, 'batch_first': True}
"num_channels": 1
"sample_rate": 16000
"sincnet": {'stride': 10, 'sample_rate': 16000}
```

## Papers

- [pyannote.audio 2.1 speaker diarization pipeline: principle, benchmark, and recipe](https://hal.science/hal-04247212/document)
- [pyannote.audio speaker diarization pipeline at VoxSRC 2023](https://mmai.io/datasets/voxceleb/voxsrc/data_workshop_2023/reports/pyannote_report.pdf)
````
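A side note on the numbers above: `classes=['speaker#1', 'speaker#2', 'speaker#3']` together with `powerset_max_classes=2` is exactly what makes the classifier 7-way, since the powerset encoding enumerates all speaker subsets of size at most 2. A small illustration (not from the PR):

```python
# C(3, 0) + C(3, 1) + C(3, 2) = 1 + 3 + 3 = 7 output classes.
from itertools import combinations

num_speakers = 3   # max_speakers_per_chunk in config.yaml
max_per_frame = 2  # max_speakers_per_frame in config.yaml

powerset = [
    subset
    for size in range(max_per_frame + 1)
    for subset in combinations(range(num_speakers), size)
]
print(powerset)  # [(), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2)]
assert len(powerset) == 7
```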
scripts/pyannote/segmentation/preprocess.sh (executable, new file, 31 lines)

```bash
#!/usr/bin/env bash

python3 -m onnxruntime.quantization.preprocess --input model.onnx --output tmp.preprocessed.onnx
mv ./tmp.preprocessed.onnx ./model.onnx
./show-onnx.py --filename ./model.onnx

<<EOF
=========./model.onnx==========
NodeArg(name='x', type='tensor(float)', shape=[1, 1, 'T'])
-----
NodeArg(name='y', type='tensor(float)', shape=[1, 'floor(floor(floor(floor(T/10 - 251/10)/3 - 2/3)/3)/3 - 8/3) + 1', 7])

floor(floor(floor(floor(T/10 - 251/10)/3 - 2/3)/3)/3 - 8/3) + 1
= floor(floor(floor(floor(T - 251)/30 - 2/3)/3)/3 - 8/3) + 1
= floor(floor(floor(floor(T - 271)/30)/3)/3 - 8/3) + 1
= floor(floor(floor(floor(T - 271)/90))/3 - 8/3) + 1
= floor(floor(floor(T - 271)/90)/3 - 8/3) + 1
= floor(floor((T - 271)/90)/3 - 8/3) + 1
= floor(floor((T - 271)/90 - 8)/3) + 1
= floor(floor((T - 271 - 720)/90)/3) + 1
= floor(floor((T - 991)/90)/3) + 1
= floor(floor((T - 991)/270)) + 1
= (T - 991)/270 + 1
= (T - 991 + 270)/270
= (T - 721)/270

It means:
- Number of input samples should be at least 721
- One frame corresponds to 270 samples. (If we use T + 270, it outputs one more frame)
EOF
```
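A quick numeric check of the simplified closed form `(T - 721)/270` derived above, using the 10-second export example where `T = 16000 * 10` (an illustrative snippet, not part of the PR):

```python
# Closed-form frame count from the derivation above: floor((T - 721) / 270).
def num_frames(T: int) -> int:
    return (T - 721) // 270

assert num_frames(16000 * 10) == 589  # matches the (1, 589, 7) shape asserted in export-onnx.py
assert num_frames(991) == 1           # one full receptive field (991 samples) -> 1 frame
assert num_frames(991 + 270) == 2     # every extra 270 samples adds one frame
```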
scripts/pyannote/segmentation/run.sh (executable, new file, 59 lines)

```bash
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

set -ex
function install_pyannote() {
  pip install pyannote.audio onnx onnxruntime
}

function download_test_files() {
  curl -SL -O https://huggingface.co/csukuangfj/pyannote-models/resolve/main/segmentation-3.0/pytorch_model.bin
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
}

install_pyannote
download_test_files

./export-onnx.py
./preprocess.sh

echo "----------torch----------"
./vad-torch.py

echo "----------onnx model.onnx----------"
./vad-onnx.py --model ./model.onnx --wav ./lei-jun-test.wav

echo "----------onnx model.int8.onnx----------"
./vad-onnx.py --model ./model.int8.onnx --wav ./lei-jun-test.wav

cat >README.md << EOF
# Introduction

Models in this folder are converted from
https://huggingface.co/pyannote/segmentation-3.0/tree/main

EOF

cat >LICENSE <<EOF
MIT License

Copyright (c) 2022 CNRS

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
EOF
```
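run.sh only checks torch/ONNX agreement indirectly, by printing the VAD segments from both backends. A minimal sketch of a direct numeric comparison, assuming `pytorch_model.bin` and `model.onnx` are already present in the working directory; the tolerance is an ad-hoc guess, not a value from the PR:

```python
# Compare the PyTorch model and the exported ONNX model on one random chunk.
import numpy as np
import onnxruntime as ort
import torch
from pyannote.audio import Model

model = Model.from_pretrained("./pytorch_model.bin")
model.eval()

x = torch.rand(1, 1, 16000 * 10)  # (N, num_channels, num_samples)
with torch.no_grad():
    expected = model(x).numpy()

sess = ort.InferenceSession("./model.onnx", providers=["CPUExecutionProvider"])
(actual,) = sess.run(["y"], {"x": x.numpy()})

# float accumulation (e.g. in the LSTM) differs slightly between backends
assert np.allclose(expected, actual, atol=1e-4), np.abs(expected - actual).max()
```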
scripts/pyannote/segmentation/show-onnx.py (executable, new file, 43 lines)

```python
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

import argparse

import onnxruntime


def get_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--filename",
        type=str,
        required=True,
        help="Path to model.onnx",
    )

    return parser.parse_args()


def show(filename):
    session_opts = onnxruntime.SessionOptions()
    session_opts.log_severity_level = 3
    sess = onnxruntime.InferenceSession(filename, session_opts)
    for i in sess.get_inputs():
        print(i)

    print("-----")

    for i in sess.get_outputs():
        print(i)


def main():
    args = get_args()
    print(f"========={args.filename}==========")
    show(args.filename)


if __name__ == "__main__":
    main()
```
scripts/pyannote/segmentation/vad-onnx.py (executable, new file, 242 lines)

```python
#!/usr/bin/env python3

"""
./export-onnx.py
./preprocess.sh

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
./vad-onnx.py --model ./model.onnx --wav ./lei-jun-test.wav
"""

import argparse
from pathlib import Path

import librosa
import numpy as np
import onnxruntime as ort
import soundfile as sf
from numpy.lib.stride_tricks import as_strided


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True, help="Path to model.onnx")
    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")

    return parser.parse_args()


class OnnxModel:
    def __init__(self, filename):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts

        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)

        self.window_size = int(meta["window_size"])
        self.sample_rate = int(meta["sample_rate"])
        self.window_shift = int(0.1 * self.window_size)
        self.receptive_field_size = int(meta["receptive_field_size"])
        self.receptive_field_shift = int(meta["receptive_field_shift"])
        self.num_speakers = int(meta["num_speakers"])
        self.powerset_max_classes = int(meta["powerset_max_classes"])
        self.num_classes = int(meta["num_classes"])

    def __call__(self, x):
        """
        Args:
          x: (N, num_samples)
        Returns:
          A tensor of shape (N, num_frames, num_classes)
        """
        x = np.expand_dims(x, axis=1)

        (y,) = self.model.run(
            [self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: x}
        )

        return y


def load_wav(filename, expected_sample_rate) -> np.ndarray:
    audio, sample_rate = sf.read(filename, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != expected_sample_rate:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=expected_sample_rate,
        )
    return audio


def get_powerset_mapping(num_classes, num_speakers, powerset_max_classes):
    mapping = np.zeros((num_classes, num_speakers))

    k = 1
    for i in range(1, powerset_max_classes + 1):
        if i == 1:
            for j in range(0, num_speakers):
                mapping[k, j] = 1
                k += 1
        elif i == 2:
            for j in range(0, num_speakers):
                for m in range(j + 1, num_speakers):
                    mapping[k, j] = 1
                    mapping[k, m] = 1
                    k += 1
        elif i == 3:
            raise RuntimeError("Unsupported")

    return mapping


def to_multi_label(y, mapping):
    """
    Args:
      y: (num_chunks, num_frames, num_classes)
    Returns:
      A tensor of shape (num_chunks, num_frames, num_speakers)
    """
    y = np.argmax(y, axis=-1)
    labels = mapping[y.reshape(-1)].reshape(y.shape[0], y.shape[1], -1)
    return labels


def main():
    args = get_args()
    assert Path(args.model).is_file(), args.model
    assert Path(args.wav).is_file(), args.wav

    m = OnnxModel(args.model)
    audio = load_wav(args.wav, m.sample_rate)
    # audio: (num_samples,)
    print("audio", audio.shape, audio.min(), audio.max(), audio.sum())

    num = (audio.shape[0] - m.window_size) // m.window_shift + 1

    samples = as_strided(
        audio,
        shape=(num, m.window_size),
        strides=(m.window_shift * audio.strides[0], audio.strides[0]),
    )

    # or use torch.Tensor.unfold
    # samples = torch.from_numpy(audio).unfold(0, m.window_size, m.window_shift).numpy()

    print(
        "samples",
        samples.shape,
        samples.mean(),
        samples.sum(),
        samples[:3, :3].sum(axis=-1),
    )

    if (
        audio.shape[0] < m.window_size
        or (audio.shape[0] - m.window_size) % m.window_shift > 0
    ):
        has_last_chunk = True
    else:
        has_last_chunk = False

    num_chunks = samples.shape[0]
    batch_size = 32
    output = []
    for i in range(0, num_chunks, batch_size):
        start = i
        end = i + batch_size
        # it's perfectly ok to use end > num_chunks
        y = m(samples[start:end])
        output.append(y)

    if has_last_chunk:
        last_chunk = audio[num_chunks * m.window_shift :]  # noqa
        pad_size = m.window_size - last_chunk.shape[0]
        last_chunk = np.pad(last_chunk, (0, pad_size))
        last_chunk = np.expand_dims(last_chunk, axis=0)
        y = m(last_chunk)
        output.append(y)

    y = np.vstack(output)
    # y: (num_chunks, num_frames, num_classes)

    mapping = get_powerset_mapping(
        num_classes=m.num_classes,
        num_speakers=m.num_speakers,
        powerset_max_classes=m.powerset_max_classes,
    )
    labels = to_multi_label(y, mapping=mapping)
    # labels: (num_chunks, num_frames, num_speakers)

    # binary classification
    labels = np.max(labels, axis=-1)
    # labels: (num_chunks, num_frames)

    num_frames = (
        int(
            (m.window_size + (labels.shape[0] - 1) * m.window_shift)
            / m.receptive_field_shift
        )
        + 1
    )

    count = np.zeros((num_frames,))
    classification = np.zeros((num_frames,))
    weight = np.hamming(labels.shape[1])

    for i in range(labels.shape[0]):
        this_chunk = labels[i]
        start = int(i * m.window_shift / m.receptive_field_shift + 0.5)
        end = start + this_chunk.shape[0]

        classification[start:end] += this_chunk * weight
        count[start:end] += weight

    classification /= np.maximum(count, 1e-12)

    if has_last_chunk:
        stop_frame = int(audio.shape[0] / m.receptive_field_shift)
        classification = classification[:stop_frame]

    classification = classification.tolist()

    onset = 0.5
    offset = 0.5

    is_active = classification[0] > onset
    # the very first frame may already be active, in which case the
    # segment starts at frame 0
    start = 0 if is_active else None

    scale = m.receptive_field_shift / m.sample_rate
    scale_offset = m.receptive_field_size / m.sample_rate * 0.5

    for i in range(len(classification)):
        if is_active:
            if classification[i] < offset:
                print(
                    f"{start*scale + scale_offset:.3f} -- {i*scale + scale_offset:.3f}"
                )
                is_active = False
        else:
            if classification[i] > onset:
                start = i
                is_active = True

    if is_active:
        print(
            f"{start*scale + scale_offset:.3f} -- {(len(classification)-1)*scale + scale_offset:.3f}"
        )


if __name__ == "__main__":
    main()
```
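`get_powerset_mapping()` above spells out the size-1 and size-2 subsets by hand and rejects anything larger. A size-generic equivalent can be sketched with `itertools.combinations`, assuming the same enumeration order (subsets by increasing size, lexicographic within each size, with row 0 left as the empty set):

```python
from itertools import combinations

import numpy as np


def get_powerset_mapping_generic(num_classes, num_speakers, powerset_max_classes):
    """Row k is the multi-hot encoding of the k-th speaker subset."""
    mapping = np.zeros((num_classes, num_speakers))
    k = 0
    for size in range(powerset_max_classes + 1):
        for subset in combinations(range(num_speakers), size):
            mapping[k, list(subset)] = 1  # no-op for the empty subset (row 0)
            k += 1
    assert k == num_classes, (k, num_classes)
    return mapping
```

For `num_classes=7, num_speakers=3, powerset_max_classes=2` this produces the same matrix as the hand-rolled version above.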
scripts/pyannote/segmentation/vad-torch.py (executable, new file, 38 lines)

```python
#!/usr/bin/env python3

import torch
from pyannote.audio import Model
from pyannote.audio.pipelines import (
    VoiceActivityDetection as VoiceActivityDetectionPipeline,
)


@torch.no_grad()
def main():
    # Please download it from
    # https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0
    pt_filename = "./pytorch_model.bin"
    model = Model.from_pretrained(pt_filename)
    model.eval()

    pipeline = VoiceActivityDetectionPipeline(segmentation=model)

    # https://huggingface.co/pyannote/voice-activity-detection/blob/main/config.yaml
    # https://github.com/pyannote/pyannote-audio/issues/1215
    initial_params = {
        "min_duration_on": 0.0,
        "min_duration_off": 0.0,
    }
    pipeline.onset = 0.5
    pipeline.offset = 0.5

    pipeline.instantiate(initial_params)

    # wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
    t = pipeline("./lei-jun-test.wav")
    print(type(t))
    print(t)


if __name__ == "__main__":
    main()
```
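The pipeline returns a `pyannote.core.Annotation`; instead of printing the whole object, it can be walked segment by segment, which yields output directly comparable to `./vad-onnx.py`. A sketch, assuming `pipeline` has been instantiated exactly as in vad-torch.py above:

```python
# Walk the returned Annotation segment by segment (continuation of main()).
t = pipeline("./lei-jun-test.wav")
for segment, _track, label in t.itertracks(yield_label=True):
    print(f"{segment.start:.3f} -- {segment.end:.3f} ({label})")
```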