diff --git a/.github/workflows/export-pyannote-segmentation-to-onnx.yaml b/.github/workflows/export-pyannote-segmentation-to-onnx.yaml
new file mode 100644
index 00000000..300aca50
--- /dev/null
+++ b/.github/workflows/export-pyannote-segmentation-to-onnx.yaml
@@ -0,0 +1,86 @@
+name: export-pyannote-segmentation-to-onnx
+
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: export-pyannote-segmentation-to-onnx-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  export-pyannote-segmentation-to-onnx:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: export Pyannote segmentation models to ONNX
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install pyannote
+        shell: bash
+        run: |
+          pip install pyannote.audio onnx onnxruntime
+
+      - name: Run
+        shell: bash
+        run: |
+          d=sherpa-onnx-pyannote-segmentation-3-0
+          src=$PWD/$d
+          mkdir -p $src
+
+          pushd scripts/pyannote/segmentation
+          ./run.sh
+          cp ./*.onnx $src/
+          cp ./README.md $src/
+          cp ./LICENSE $src/
+          cp ./run.sh $src/
+          cp ./*.py $src/
+
+          popd
+          ls -lh $d
+          tar cjvf $d.tar.bz2 $d
+
+      - name: Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: speaker-segmentation-models
+
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            d=sherpa-onnx-pyannote-segmentation-3-0
+            export GIT_LFS_SKIP_SMUDGE=1
+            export GIT_CLONE_PROTECTION_ACTIVE=false
+            git clone https://huggingface.co/csukuangfj/$d huggingface
+            cp -v $d/* ./huggingface
+            cd huggingface
+            git lfs track "*.onnx"
+            git status
+            git add .
+            git status
+            git commit -m "add models"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main
diff --git a/scripts/pyannote/segmentation/.gitignore b/scripts/pyannote/segmentation/.gitignore
new file mode 100644
index 00000000..cc235d3d
--- /dev/null
+++ b/scripts/pyannote/segmentation/.gitignore
@@ -0,0 +1,2 @@
+*.bin
+*.onnx
diff --git a/scripts/pyannote/segmentation/export-onnx.py b/scripts/pyannote/segmentation/export-onnx.py
new file mode 100755
index 00000000..5f6e79c7
--- /dev/null
+++ b/scripts/pyannote/segmentation/export-onnx.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+
+from typing import Any, Dict
+
+import onnx
+import torch
+from onnxruntime.quantization import QuantType, quantize_dynamic
+from pyannote.audio import Model
+from pyannote.audio.core.task import Problem, Resolution
+
+
+def add_meta_data(filename: str, meta_data: Dict[str, Any]):
+    """Add metadata to an ONNX model. The model file is changed in-place.
+
+    Args:
+      filename:
+        Filename of the ONNX model to be changed.
+      meta_data:
+        Key-value pairs.
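+        Values are converted with str() before being saved, since ONNX
+        metadata_props can only store string key-value pairs.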
+ """ + model = onnx.load(filename) + + while len(model.metadata_props): + model.metadata_props.pop() + + for key, value in meta_data.items(): + meta = model.metadata_props.add() + meta.key = key + meta.value = str(value) + + onnx.save(model, filename) + + +@torch.no_grad() +def main(): + # You can download ./pytorch_model.bin from + # https://hf-mirror.com/csukuangfj/pyannote-models/tree/main/segmentation-3.0 + pt_filename = "./pytorch_model.bin" + model = Model.from_pretrained(pt_filename) + model.eval() + assert model.dimension == 7, model.dimension + print(model.specifications) + + assert ( + model.specifications.problem == Problem.MONO_LABEL_CLASSIFICATION + ), model.specifications.problem + + assert ( + model.specifications.resolution == Resolution.FRAME + ), model.specifications.resolution + + assert model.specifications.duration == 10.0, model.specifications.duration + + assert model.audio.sample_rate == 16000, model.audio.sample_rate + + # (batch, num_channels, num_samples) + assert list(model.example_input_array.shape) == [ + 1, + 1, + 16000 * 10, + ], model.example_input_array.shape + + example_output = model(model.example_input_array) + + # (batch, num_frames, num_classes) + assert list(example_output.shape) == [1, 589, 7], example_output.shape + + assert model.receptive_field.step == 0.016875, model.receptive_field.step + assert model.receptive_field.duration == 0.0619375, model.receptive_field.duration + assert model.receptive_field.step * 16000 == 270, model.receptive_field.step * 16000 + assert model.receptive_field.duration * 16000 == 991, ( + model.receptive_field.duration * 16000 + ) + + opset_version = 18 + + filename = "model.onnx" + torch.onnx.export( + model, + model.example_input_array, + filename, + opset_version=opset_version, + input_names=["x"], + output_names=["y"], + dynamic_axes={ + "x": {0: "N", 2: "T"}, + "y": {0: "N", 1: "T"}, + }, + ) + + sample_rate = model.audio.sample_rate + + window_size = int(model.specifications.duration) * 16000 + receptive_field_size = int(model.receptive_field.duration * 16000) + receptive_field_shift = int(model.receptive_field.step * 16000) + + meta_data = { + "num_speakers": len(model.specifications.classes), + "powerset_max_classes": model.specifications.powerset_max_classes, + "num_classes": model.dimension, + "sample_rate": sample_rate, + "window_size": window_size, + "receptive_field_size": receptive_field_size, + "receptive_field_shift": receptive_field_shift, + "model_type": "pyannote-segmentation-3.0", + "version": "1", + "model_author": "pyannote", + "maintainer": "k2-fsa", + "url_1": "https://huggingface.co/pyannote/segmentation-3.0", + "url_2": "https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0", + "license": "https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE", + } + add_meta_data(filename=filename, meta_data=meta_data) + + print("Generate int8 quantization models") + + filename_int8 = "model.int8.onnx" + quantize_dynamic( + model_input=filename, + model_output=filename_int8, + weight_type=QuantType.QUInt8, + ) + + print(f"Saved to {filename} and {filename_int8}") + + +if __name__ == "__main__": + main() diff --git a/scripts/pyannote/segmentation/notes.md b/scripts/pyannote/segmentation/notes.md new file mode 100644 index 00000000..38bdd5bc --- /dev/null +++ b/scripts/pyannote/segmentation/notes.md @@ -0,0 +1,78 @@ + +# config.yaml + + +```yaml +task: + _target_: pyannote.audio.tasks.SpeakerDiarization + duration: 10.0 + max_speakers_per_chunk: 3 + max_speakers_per_frame: 2 
+model:
+  _target_: pyannote.audio.models.segmentation.PyanNet
+  sample_rate: 16000
+  num_channels: 1
+  sincnet:
+    stride: 10
+  lstm:
+    hidden_size: 128
+    num_layers: 4
+    bidirectional: true
+    monolithic: true
+  linear:
+    hidden_size: 128
+    num_layers: 2
+```
+
+# Model architecture of ./pytorch_model.bin
+
+`print(model)`:
+
+```python3
+PyanNet(
+  (sincnet): SincNet(
+    (wav_norm1d): InstanceNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
+    (conv1d): ModuleList(
+      (0): Encoder(
+        (filterbank): ParamSincFB()
+      )
+      (1): Conv1d(80, 60, kernel_size=(5,), stride=(1,))
+      (2): Conv1d(60, 60, kernel_size=(5,), stride=(1,))
+    )
+    (pool1d): ModuleList(
+      (0-2): 3 x MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
+    )
+    (norm1d): ModuleList(
+      (0): InstanceNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
+      (1-2): 2 x InstanceNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
+    )
+  )
+  (lstm): LSTM(60, 128, num_layers=4, batch_first=True, dropout=0.5, bidirectional=True)
+  (linear): ModuleList(
+    (0): Linear(in_features=256, out_features=128, bias=True)
+    (1): Linear(in_features=128, out_features=128, bias=True)
+  )
+  (classifier): Linear(in_features=128, out_features=7, bias=True)
+  (activation): LogSoftmax(dim=-1)
+)
+```
+
+```python3
+>>> list(model.specifications)
+[Specifications(problem=<Problem.MONO_LABEL_CLASSIFICATION: 1>, resolution=<Resolution.FRAME: 1>, duration=10.0, min_duration=None, warm_up=(0.0, 0.0), classes=['speaker#1', 'speaker#2', 'speaker#3'], powerset_max_classes=2, permutation_invariant=True)]
+```
+
+```python3
+>>> model.hparams
+"linear": {'hidden_size': 128, 'num_layers': 2}
+"lstm": {'hidden_size': 128, 'num_layers': 4, 'bidirectional': True, 'monolithic': True, 'dropout': 0.5, 'batch_first': True}
+"num_channels": 1
+"sample_rate": 16000
+"sincnet": {'stride': 10, 'sample_rate': 16000}
+```
+
+# Papers
+
+- [pyannote.audio 2.1 speaker diarization pipeline: principle, benchmark, and recipe](https://hal.science/hal-04247212/document)
+- [pyannote.audio speaker diarization pipeline at VoxSRC 2023](https://mmai.io/datasets/voxceleb/voxsrc/data_workshop_2023/reports/pyannote_report.pdf)
+
diff --git a/scripts/pyannote/segmentation/preprocess.sh b/scripts/pyannote/segmentation/preprocess.sh
new file mode 100755
index 00000000..703420b1
--- /dev/null
+++ b/scripts/pyannote/segmentation/preprocess.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+
+python3 -m onnxruntime.quantization.preprocess --input model.onnx --output tmp.preprocessed.onnx
+mv ./tmp.preprocessed.onnx ./model.onnx
+./show-onnx.py --filename ./model.onnx
+
+cat >README.md << EOF
+# Introduction
+
+Models in this file are converted from
+https://huggingface.co/pyannote/segmentation-3.0/tree/main
+
+EOF
+
+cat >LICENSE << EOF
+See https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE
+EOF
diff --git a/scripts/pyannote/segmentation/vad-onnx.py b/scripts/pyannote/segmentation/vad-onnx.py
new file mode 100755
--- /dev/null
+++ b/scripts/pyannote/segmentation/vad-onnx.py
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+
+import librosa
+import numpy as np
+import onnxruntime as ort
+import soundfile as sf
+from numpy.lib.stride_tricks import as_strided
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True, help="Path to model.onnx")
+    parser.add_argument("--wav", type=str, required=True, help="Path to the test wave")
+    return parser.parse_args()
+
+
+class OnnxModel:
+    def __init__(self, filename: str):
+        self.model = ort.InferenceSession(
+            filename,
+            providers=["CPUExecutionProvider"],
+        )
+
+        # read back the metadata written by export-onnx.py
+        meta = self.model.get_modelmeta().custom_metadata_map
+        self.sample_rate = int(meta["sample_rate"])
+        self.window_size = int(meta["window_size"])
+        self.receptive_field_size = int(meta["receptive_field_size"])
+        self.receptive_field_shift = int(meta["receptive_field_shift"])
+        self.num_speakers = int(meta["num_speakers"])
+        self.powerset_max_classes = int(meta["powerset_max_classes"])
+        self.num_classes = int(meta["num_classes"])
+
+        # move the window by 10% of its size, pyannote's default step
+        self.window_shift = int(0.1 * self.window_size)
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """
+        Args:
+          x: (batch_size, num_samples)
+        Returns:
+          A tensor of shape (batch_size, num_frames, num_classes)
+        """
+        x = np.expand_dims(x, axis=1)  # (batch_size, 1, num_samples)
+        (y,) = self.model.run(["y"], {"x": x})
+        return y
+
+
+def load_wav(filename, expected_sample_rate) -> np.ndarray:
+    audio, sample_rate = sf.read(filename, dtype="float32", always_2d=True)
+    audio = audio[:, 0]  # only use the first channel
+    if sample_rate != expected_sample_rate:
+        audio = librosa.resample(
+            audio,
+            orig_sr=sample_rate,
+            target_sr=expected_sample_rate,
+        )
+    return audio
+
+
+def get_powerset_mapping(num_classes, num_speakers, powerset_max_classes):
+    mapping = np.zeros((num_classes, num_speakers))
+
+    # row 0 stays all-zero: it is the "no speaker" (silence) class
+    k = 1
+    for i in range(1, powerset_max_classes + 1):
+        if i == 1:
+            for j in range(0, num_speakers):
+                mapping[k, j] = 1
+                k += 1
+        elif i == 2:
+            for j in range(0, num_speakers):
+                for m in range(j + 1, num_speakers):
+                    mapping[k, j] = 1
+                    mapping[k, m] = 1
+                    k += 1
+        elif i == 3:
+            raise RuntimeError("Unsupported")
+
+    return mapping
+
+
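+# For this model (num_classes=7, num_speakers=3, powerset_max_classes=2),
+# get_powerset_mapping returns
+#   row 0: [0, 0, 0]  -> silence
+#   row 1: [1, 0, 0]  -> speaker 1
+#   row 2: [0, 1, 0]  -> speaker 2
+#   row 3: [0, 0, 1]  -> speaker 3
+#   row 4: [1, 1, 0]  -> speakers 1 and 2
+#   row 5: [1, 0, 1]  -> speakers 1 and 3
+#   row 6: [0, 1, 1]  -> speakers 2 and 3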
+def to_multi_label(y, mapping):
+    """
+    Args:
+      y: (num_chunks, num_frames, num_classes)
+      mapping: (num_classes, num_speakers)
+    Returns:
+      A tensor of shape (num_chunks, num_frames, num_speakers)
+    """
+    y = np.argmax(y, axis=-1)
+    labels = mapping[y.reshape(-1)].reshape(y.shape[0], y.shape[1], -1)
+    return labels
+
+
+def main():
+    args = get_args()
+    assert Path(args.model).is_file(), args.model
+    assert Path(args.wav).is_file(), args.wav
+
+    m = OnnxModel(args.model)
+    audio = load_wav(args.wav, m.sample_rate)
+    # audio: (num_samples,)
+    print("audio", audio.shape, audio.min(), audio.max(), audio.sum())
+
+    num = (audio.shape[0] - m.window_size) // m.window_shift + 1
+    num = max(num, 0)  # the audio may be shorter than a single window
+
+    samples = as_strided(
+        audio,
+        shape=(num, m.window_size),
+        strides=(m.window_shift * audio.strides[0], audio.strides[0]),
+    )
+
+    # or use torch.Tensor.unfold
+    # samples = torch.from_numpy(audio).unfold(0, m.window_size, m.window_shift).numpy()
+
+    print(
+        "samples",
+        samples.shape,
+        samples.mean(),
+        samples.sum(),
+        samples[:3, :3].sum(axis=-1),
+    )
+
+    if (
+        audio.shape[0] < m.window_size
+        or (audio.shape[0] - m.window_size) % m.window_shift > 0
+    ):
+        has_last_chunk = True
+    else:
+        has_last_chunk = False
+
+    num_chunks = samples.shape[0]
+    batch_size = 32
+    output = []
+    for i in range(0, num_chunks, batch_size):
+        start = i
+        end = i + batch_size
+        # slicing clips end at num_chunks, so end > num_chunks is fine
+        y = m(samples[start:end])
+        output.append(y)
+
+    if has_last_chunk:
+        last_chunk = audio[num_chunks * m.window_shift :]  # noqa
+        pad_size = m.window_size - last_chunk.shape[0]
+        last_chunk = np.pad(last_chunk, (0, pad_size))
+        last_chunk = np.expand_dims(last_chunk, axis=0)
+        y = m(last_chunk)
+        output.append(y)
+
+    y = np.vstack(output)
+    # y: (num_chunks, num_frames, num_classes)
+
+    mapping = get_powerset_mapping(
+        num_classes=m.num_classes,
+        num_speakers=m.num_speakers,
+        powerset_max_classes=m.powerset_max_classes,
+    )
+    labels = to_multi_label(y, mapping=mapping)
+    # labels: (num_chunks, num_frames, num_speakers)
+
+    # binary classification: a frame is speech if any speaker is active
+    labels = np.max(labels, axis=-1)
+    # labels: (num_chunks, num_frames)
+
+    # total number of output frames covered by all the chunks
+    num_frames = (
+        int(
+            (m.window_size + (labels.shape[0] - 1) * m.window_shift)
+            / m.receptive_field_shift
+        )
+        + 1
+    )
+
+    # overlap-add the per-chunk scores, weighted by a Hamming window
+    count = np.zeros((num_frames,))
+    classification = np.zeros((num_frames,))
+    weight = np.hamming(labels.shape[1])
+
+    for i in range(labels.shape[0]):
+        this_chunk = labels[i]
+        start = int(i * m.window_shift / m.receptive_field_shift + 0.5)
+        end = start + this_chunk.shape[0]
+
+        classification[start:end] += this_chunk * weight
+        count[start:end] += weight
+
+    classification /= np.maximum(count, 1e-12)
+
+    if has_last_chunk:
+        stop_frame = int(audio.shape[0] / m.receptive_field_shift)
+        classification = classification[:stop_frame]
+
+    classification = classification.tolist()
+
+    # hysteresis thresholds: a segment starts when the score rises above
+    # onset and ends when it falls below offset
+    onset = 0.5
+    offset = 0.5
+
+    is_active = classification[0] > onset
+    start = 0 if is_active else None
+
+    # convert a frame index to seconds; shift by half a receptive field
+    scale = m.receptive_field_shift / m.sample_rate
+    scale_offset = m.receptive_field_size / m.sample_rate * 0.5
+
+    for i in range(len(classification)):
+        if is_active:
+            if classification[i] < offset:
+                print(
+                    f"{start*scale + scale_offset:.3f} -- {i*scale + scale_offset:.3f}"
+                )
+                is_active = False
+        else:
+            if classification[i] > onset:
+                start = i
+                is_active = True
+
+    if is_active:
+        print(
+            f"{start*scale + scale_offset:.3f} -- {(len(classification)-1)*scale + scale_offset:.3f}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/pyannote/segmentation/vad-torch.py b/scripts/pyannote/segmentation/vad-torch.py
new file mode 100755
index 00000000..cc4e1108
--- /dev/null
+++ b/scripts/pyannote/segmentation/vad-torch.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+import torch
+from pyannote.audio import Model
+from pyannote.audio.pipelines import (
+    VoiceActivityDetection as VoiceActivityDetectionPipeline,
+)
+
+
+@torch.no_grad()
+def main():
+    # Please download it from
+    # https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0
+    pt_filename = "./pytorch_model.bin"
+    model = Model.from_pretrained(pt_filename)
+    model.eval()
+
+    pipeline = VoiceActivityDetectionPipeline(segmentation=model)
+
+    # https://huggingface.co/pyannote/voice-activity-detection/blob/main/config.yaml
+    # https://github.com/pyannote/pyannote-audio/issues/1215
+    initial_params = {
+        "min_duration_on": 0.0,
+        "min_duration_off": 0.0,
+    }
+    pipeline.onset = 0.5
+    pipeline.offset = 0.5
+
+    pipeline.instantiate(initial_params)
+
+    # wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+    t = pipeline("./lei-jun-test.wav")
+    print(type(t))
+    print(t)
+
+
+if __name__ == "__main__":
+    main()
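+
+# Example usage:
+#   python3 ./vad-torch.py
+# t should be a pyannote.core.Annotation holding the detected speech segments.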