Add Python API for source separation (#2283)
This commit is contained in:
26
.github/scripts/test-python.sh
vendored
26
.github/scripts/test-python.sh
vendored
@@ -8,6 +8,32 @@ log() {
|
|||||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log "test spleeter"
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/sherpa-onnx-spleeter-2stems-fp16.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-spleeter-2stems-fp16.tar.bz2
|
||||||
|
rm sherpa-onnx-spleeter-2stems-fp16.tar.bz2
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/qi-feng-le-zh.wav
|
||||||
|
./python-api-examples/offline-source-separation-spleeter.py
|
||||||
|
rm -rf sherpa-onnx-spleeter-2stems-fp16
|
||||||
|
rm qi-feng-le-zh.wav
|
||||||
|
|
||||||
|
log "test UVR"
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/UVR_MDXNET_9482.onnx
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/qi-feng-le-zh.wav
|
||||||
|
./python-api-examples/offline-source-separation-uvr.py
|
||||||
|
rm UVR_MDXNET_9482.onnx
|
||||||
|
rm qi-feng-le-zh.wav
|
||||||
|
|
||||||
|
mkdir source-separation
|
||||||
|
|
||||||
|
mv spleeter-*.wav source-separation
|
||||||
|
mv uvr-*.wav source-separation
|
||||||
|
|
||||||
|
ls -lh source-separation
|
||||||
|
|
||||||
|
|
||||||
log "test offline dolphin ctc"
|
log "test offline dolphin ctc"
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
|
||||||
tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
|
tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
|
||||||
|
|||||||
5
.github/workflows/run-python-test-macos.yaml
vendored
5
.github/workflows/run-python-test-macos.yaml
vendored
@@ -97,6 +97,11 @@ jobs:
|
|||||||
.github/scripts/test-python.sh
|
.github/scripts/test-python.sh
|
||||||
.github/scripts/test-speaker-recognition-python.sh
|
.github/scripts/test-speaker-recognition-python.sh
|
||||||
|
|
||||||
|
- uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: source-separation-${{ matrix.os }}-${{ matrix.python-version }}
|
||||||
|
path: ./source-separation
|
||||||
|
|
||||||
- uses: actions/upload-artifact@v4
|
- uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: tts-generated-test-files-${{ matrix.os }}-${{ matrix.python-version }}
|
name: tts-generated-test-files-${{ matrix.os }}-${{ matrix.python-version }}
|
||||||
|
|||||||
35
.github/workflows/run-python-test.yaml
vendored
35
.github/workflows/run-python-test.yaml
vendored
@@ -36,22 +36,18 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
# it fails to install ffmpeg on ubuntu 20.04
|
- os: ubuntu-24.04
|
||||||
#
|
python-version: "3.8"
|
||||||
# - os: ubuntu-20.04
|
- os: ubuntu-24.04
|
||||||
# python-version: "3.7"
|
python-version: "3.9"
|
||||||
# - os: ubuntu-20.04
|
|
||||||
# python-version: "3.8"
|
|
||||||
# - os: ubuntu-20.04
|
|
||||||
# python-version: "3.9"
|
|
||||||
|
|
||||||
- os: ubuntu-22.04
|
- os: ubuntu-24.04
|
||||||
python-version: "3.10"
|
python-version: "3.10"
|
||||||
- os: ubuntu-22.04
|
- os: ubuntu-24.04
|
||||||
python-version: "3.11"
|
python-version: "3.11"
|
||||||
- os: ubuntu-22.04
|
- os: ubuntu-24.04
|
||||||
python-version: "3.12"
|
python-version: "3.12"
|
||||||
- os: ubuntu-22.04
|
- os: ubuntu-24.04
|
||||||
python-version: "3.13"
|
python-version: "3.13"
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
@@ -81,10 +77,12 @@ jobs:
|
|||||||
python3 -m pip install --upgrade pip numpy pypinyin sentencepiece>=0.1.96 soundfile
|
python3 -m pip install --upgrade pip numpy pypinyin sentencepiece>=0.1.96 soundfile
|
||||||
python3 -m pip install wheel twine setuptools
|
python3 -m pip install wheel twine setuptools
|
||||||
|
|
||||||
- name: Install ffmpeg
|
- uses: afoley587/setup-ffmpeg@main
|
||||||
shell: bash
|
id: setup-ffmpeg
|
||||||
run: |
|
with:
|
||||||
sudo apt-get install ffmpeg
|
ffmpeg-version: release
|
||||||
|
architecture: ''
|
||||||
|
github-token: ${{ github.server_url == 'https://github.com' && github.token || '' }}
|
||||||
|
|
||||||
- name: Install ninja
|
- name: Install ninja
|
||||||
shell: bash
|
shell: bash
|
||||||
@@ -189,6 +187,11 @@ jobs:
|
|||||||
.github/scripts/test-python.sh
|
.github/scripts/test-python.sh
|
||||||
.github/scripts/test-speaker-recognition-python.sh
|
.github/scripts/test-speaker-recognition-python.sh
|
||||||
|
|
||||||
|
- uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: source-separation-${{ matrix.os }}-${{ matrix.python-version }}-whl
|
||||||
|
path: ./source-separation
|
||||||
|
|
||||||
- uses: actions/upload-artifact@v4
|
- uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: tts-generated-test-files-${{ matrix.os }}-${{ matrix.python-version }}
|
name: tts-generated-test-files-${{ matrix.os }}-${{ matrix.python-version }}
|
||||||
|
|||||||
122
python-api-examples/offline-source-separation-spleeter.py
Executable file
122
python-api-examples/offline-source-separation-spleeter.py
Executable file
@@ -0,0 +1,122 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
"""
|
||||||
|
This file shows how to use spleeter for source separation.
|
||||||
|
|
||||||
|
Please first download a spleeter model from
|
||||||
|
|
||||||
|
https://github.com/k2-fsa/sherpa-onnx/releases/tag/source-separation-models
|
||||||
|
|
||||||
|
The following is an example:
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/sherpa-onnx-spleeter-2stems-fp16.tar.bz2
|
||||||
|
|
||||||
|
Please also download a test file
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/qi-feng-le-zh.wav
|
||||||
|
|
||||||
|
The test wav file is 16-bit encoded with 2 channels. If you have other
|
||||||
|
formats, e.g., .mp4 or .mp3, please first use ffmpeg to convert it.
|
||||||
|
For instance
|
||||||
|
|
||||||
|
ffmpeg -i your.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 out.wav
|
||||||
|
|
||||||
|
Then you can use out.wav as input for this example.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import sherpa_onnx
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
|
||||||
|
def create_offline_source_separation():
|
||||||
|
# Please read the help message at the beginning of this file
|
||||||
|
# to download model files
|
||||||
|
vocals = "./sherpa-onnx-spleeter-2stems-fp16/vocals.fp16.onnx"
|
||||||
|
accompaniment = "./sherpa-onnx-spleeter-2stems-fp16/accompaniment.fp16.onnx"
|
||||||
|
|
||||||
|
if not Path(vocals).is_file():
|
||||||
|
raise ValueError(f"{vocals} does not exist.")
|
||||||
|
|
||||||
|
if not Path(accompaniment).is_file():
|
||||||
|
raise ValueError(f"{accompaniment} does not exist.")
|
||||||
|
|
||||||
|
config = sherpa_onnx.OfflineSourceSeparationConfig(
|
||||||
|
model=sherpa_onnx.OfflineSourceSeparationModelConfig(
|
||||||
|
spleeter=sherpa_onnx.OfflineSourceSeparationSpleeterModelConfig(
|
||||||
|
vocals=vocals,
|
||||||
|
accompaniment=accompaniment,
|
||||||
|
),
|
||||||
|
num_threads=1,
|
||||||
|
debug=False,
|
||||||
|
provider="cpu",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not config.validate():
|
||||||
|
raise ValueError("Please check your config.")
|
||||||
|
|
||||||
|
return sherpa_onnx.OfflineSourceSeparation(config)
|
||||||
|
|
||||||
|
|
||||||
|
def load_audio():
|
||||||
|
# Please read the help message at the beginning of this file to download
|
||||||
|
# the following wav_file
|
||||||
|
wav_file = "./qi-feng-le-zh.wav"
|
||||||
|
if not Path(wav_file).is_file():
|
||||||
|
raise ValueError(f"{wav_file} does not exist")
|
||||||
|
|
||||||
|
samples, sample_rate = sf.read(wav_file, dtype="float32", always_2d=True)
|
||||||
|
samples = np.transpose(samples)
|
||||||
|
# now samples is of shape (num_channels, num_samples)
|
||||||
|
assert (
|
||||||
|
samples.shape[1] > samples.shape[0]
|
||||||
|
), f"You should use (num_channels, num_samples). {samples.shape}"
|
||||||
|
|
||||||
|
assert (
|
||||||
|
samples.dtype == np.float32
|
||||||
|
), f"Expect np.float32 as dtype. Given: {samples.dtype}"
|
||||||
|
|
||||||
|
return samples, sample_rate
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
sp = create_offline_source_separation()
|
||||||
|
samples, sample_rate = load_audio()
|
||||||
|
samples = np.ascontiguousarray(samples)
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
output = sp.process(sample_rate=sample_rate, samples=samples)
|
||||||
|
end = time.time()
|
||||||
|
|
||||||
|
print("output.sample_rate", output.sample_rate)
|
||||||
|
|
||||||
|
assert len(output.stems) == 2, len(output.stems)
|
||||||
|
|
||||||
|
vocals = output.stems[0].data
|
||||||
|
non_vocals = output.stems[1].data
|
||||||
|
# vocals.shape (num_channels, num_samples)
|
||||||
|
|
||||||
|
vocals = np.transpose(vocals)
|
||||||
|
non_vocals = np.transpose(non_vocals)
|
||||||
|
|
||||||
|
# vocals.shape (num_samples,num_channels)
|
||||||
|
|
||||||
|
sf.write("./spleeter-vocals.wav", vocals, samplerate=output.sample_rate)
|
||||||
|
sf.write("./spleeter-non-vocals.wav", non_vocals, samplerate=output.sample_rate)
|
||||||
|
|
||||||
|
elapsed_seconds = end - start
|
||||||
|
audio_duration = samples.shape[1] / sample_rate
|
||||||
|
real_time_factor = elapsed_seconds / audio_duration
|
||||||
|
|
||||||
|
print("Saved to ./spleeter-vocals.wav and ./spleeter-non-vocals.wav")
|
||||||
|
print(f"Elapsed seconds: {elapsed_seconds:.3f}")
|
||||||
|
print(f"Audio duration in seconds: {audio_duration:.3f}")
|
||||||
|
print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
118
python-api-examples/offline-source-separation-uvr.py
Executable file
118
python-api-examples/offline-source-separation-uvr.py
Executable file
@@ -0,0 +1,118 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
"""
|
||||||
|
This file shows how to use UVR for source separation.
|
||||||
|
|
||||||
|
Please first download a UVR model from
|
||||||
|
|
||||||
|
https://github.com/k2-fsa/sherpa-onnx/releases/tag/source-separation-models
|
||||||
|
|
||||||
|
The following is an example:
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/UVR_MDXNET_9482.onnx
|
||||||
|
|
||||||
|
Please also download a test file
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/qi-feng-le-zh.wav
|
||||||
|
|
||||||
|
The test wav file is 16-bit encoded with 2 channels. If you have other
|
||||||
|
formats, e.g., .mp4 or .mp3, please first use ffmpeg to convert it.
|
||||||
|
For instance
|
||||||
|
|
||||||
|
ffmpeg -i your.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 out.wav
|
||||||
|
|
||||||
|
Then you can use out.wav as input for this example.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import sherpa_onnx
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
|
||||||
|
def create_offline_source_separation():
|
||||||
|
# Please read the help message at the beginning of this file
|
||||||
|
# to download model files
|
||||||
|
model = "./UVR_MDXNET_9482.onnx"
|
||||||
|
|
||||||
|
if not Path(model).is_file():
|
||||||
|
raise ValueError(f"{model} does not exist.")
|
||||||
|
|
||||||
|
config = sherpa_onnx.OfflineSourceSeparationConfig(
|
||||||
|
model=sherpa_onnx.OfflineSourceSeparationModelConfig(
|
||||||
|
uvr=sherpa_onnx.OfflineSourceSeparationUvrModelConfig(
|
||||||
|
model=model,
|
||||||
|
),
|
||||||
|
num_threads=1,
|
||||||
|
debug=False,
|
||||||
|
provider="cpu",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not config.validate():
|
||||||
|
raise ValueError("Please check your config.")
|
||||||
|
|
||||||
|
return sherpa_onnx.OfflineSourceSeparation(config)
|
||||||
|
|
||||||
|
|
||||||
|
def load_audio():
|
||||||
|
# Please read the help message at the beginning of this file to download
|
||||||
|
# the following wav_file
|
||||||
|
wav_file = "./qi-feng-le-zh.wav"
|
||||||
|
if not Path(wav_file).is_file():
|
||||||
|
raise ValueError(f"{wav_file} does not exist")
|
||||||
|
|
||||||
|
samples, sample_rate = sf.read(wav_file, dtype="float32", always_2d=True)
|
||||||
|
samples = np.transpose(samples)
|
||||||
|
# now samples is of shape (num_channels, num_samples)
|
||||||
|
assert (
|
||||||
|
samples.shape[1] > samples.shape[0]
|
||||||
|
), f"You should use (num_channels, num_samples). {samples.shape}"
|
||||||
|
|
||||||
|
assert (
|
||||||
|
samples.dtype == np.float32
|
||||||
|
), f"Expect np.float32 as dtype. Given: {samples.dtype}"
|
||||||
|
|
||||||
|
return samples, sample_rate
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
sp = create_offline_source_separation()
|
||||||
|
samples, sample_rate = load_audio()
|
||||||
|
samples = np.ascontiguousarray(samples)
|
||||||
|
|
||||||
|
print("Started. Please wait")
|
||||||
|
start = time.time()
|
||||||
|
output = sp.process(sample_rate=sample_rate, samples=samples)
|
||||||
|
end = time.time()
|
||||||
|
|
||||||
|
print("output.sample_rate", output.sample_rate)
|
||||||
|
|
||||||
|
assert len(output.stems) == 2, len(output.stems)
|
||||||
|
|
||||||
|
vocals = output.stems[0].data
|
||||||
|
non_vocals = output.stems[1].data
|
||||||
|
# vocals.shape (num_channels, num_samples)
|
||||||
|
|
||||||
|
vocals = np.transpose(vocals)
|
||||||
|
non_vocals = np.transpose(non_vocals)
|
||||||
|
|
||||||
|
# vocals.shape (num_samples,num_channels)
|
||||||
|
|
||||||
|
sf.write("./uvr-vocals.wav", vocals, samplerate=output.sample_rate)
|
||||||
|
sf.write("./uvr-non-vocals.wav", non_vocals, samplerate=output.sample_rate)
|
||||||
|
|
||||||
|
elapsed_seconds = end - start
|
||||||
|
audio_duration = samples.shape[1] / sample_rate
|
||||||
|
real_time_factor = elapsed_seconds / audio_duration
|
||||||
|
|
||||||
|
print("Saved to ./uvr-vocals.wav and ./uvr-non-vocals.wav")
|
||||||
|
print(f"Elapsed seconds: {elapsed_seconds:.3f}")
|
||||||
|
print(f"Audio duration in seconds: {audio_duration:.3f}")
|
||||||
|
print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -20,6 +20,10 @@ set(srcs
|
|||||||
offline-punctuation.cc
|
offline-punctuation.cc
|
||||||
offline-recognizer.cc
|
offline-recognizer.cc
|
||||||
offline-sense-voice-model-config.cc
|
offline-sense-voice-model-config.cc
|
||||||
|
offline-source-separation-model-config.cc
|
||||||
|
offline-source-separation-spleeter-model-config.cc
|
||||||
|
offline-source-separation-uvr-model-config.cc
|
||||||
|
offline-source-separation.cc
|
||||||
offline-speech-denoiser-gtcrn-model-config.cc
|
offline-speech-denoiser-gtcrn-model-config.cc
|
||||||
offline-speech-denoiser-model-config.cc
|
offline-speech-denoiser-model-config.cc
|
||||||
offline-speech-denoiser.cc
|
offline-speech-denoiser.cc
|
||||||
|
|||||||
@@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
#include "sherpa-onnx/csrc/fast-clustering.h"
|
#include "sherpa-onnx/csrc/fast-clustering.h"
|
||||||
|
|
||||||
|
#define C_CONTIGUOUS py::detail::npy_api::constants::NPY_ARRAY_C_CONTIGUOUS_
|
||||||
|
|
||||||
namespace sherpa_onnx {
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
static void PybindFastClusteringConfig(py::module *m) {
|
static void PybindFastClusteringConfig(py::module *m) {
|
||||||
@@ -32,6 +34,12 @@ void PybindFastClustering(py::module *m) {
|
|||||||
"__call__",
|
"__call__",
|
||||||
[](const PyClass &self,
|
[](const PyClass &self,
|
||||||
py::array_t<float> features) -> std::vector<int32_t> {
|
py::array_t<float> features) -> std::vector<int32_t> {
|
||||||
|
if (!(C_CONTIGUOUS == (features.flags() & C_CONTIGUOUS))) {
|
||||||
|
throw py::value_error(
|
||||||
|
"input features should be contiguous. Please use "
|
||||||
|
"np.ascontiguousarray(features)");
|
||||||
|
}
|
||||||
|
|
||||||
int num_dim = features.ndim();
|
int num_dim = features.ndim();
|
||||||
if (num_dim != 2) {
|
if (num_dim != 2) {
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
|
|||||||
@@ -59,14 +59,14 @@ void PybindOfflineRecognizer(py::module *m) {
|
|||||||
return self.CreateStream(hotwords);
|
return self.CreateStream(hotwords);
|
||||||
},
|
},
|
||||||
py::arg("hotwords"), py::call_guard<py::gil_scoped_release>())
|
py::arg("hotwords"), py::call_guard<py::gil_scoped_release>())
|
||||||
.def("decode_stream", &PyClass::DecodeStream,
|
.def("decode_stream", &PyClass::DecodeStream, py::arg("s"),
|
||||||
py::call_guard<py::gil_scoped_release>())
|
py::call_guard<py::gil_scoped_release>())
|
||||||
.def(
|
.def(
|
||||||
"decode_streams",
|
"decode_streams",
|
||||||
[](const PyClass &self, std::vector<OfflineStream *> ss) {
|
[](const PyClass &self, std::vector<OfflineStream *> ss) {
|
||||||
self.DecodeStreams(ss.data(), ss.size());
|
self.DecodeStreams(ss.data(), ss.size());
|
||||||
},
|
},
|
||||||
py::call_guard<py::gil_scoped_release>());
|
py::arg("ss"), py::call_guard<py::gil_scoped_release>());
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace sherpa_onnx
|
} // namespace sherpa_onnx
|
||||||
|
|||||||
@@ -0,0 +1,37 @@
|
|||||||
|
// sherpa-onnx/python/csrc/offline-source-separation-model-config.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#include "sherpa-onnx/python/csrc/offline-source-separation-model-config.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/offline-source-separation-model-config.h"
|
||||||
|
#include "sherpa-onnx/python/csrc/offline-source-separation-spleeter-model-config.h"
|
||||||
|
#include "sherpa-onnx/python/csrc/offline-source-separation-uvr-model-config.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
void PybindOfflineSourceSeparationModelConfig(py::module *m) {
|
||||||
|
PybindOfflineSourceSeparationSpleeterModelConfig(m);
|
||||||
|
PybindOfflineSourceSeparationUvrModelConfig(m);
|
||||||
|
|
||||||
|
using PyClass = OfflineSourceSeparationModelConfig;
|
||||||
|
py::class_<PyClass>(*m, "OfflineSourceSeparationModelConfig")
|
||||||
|
.def(py::init<const OfflineSourceSeparationSpleeterModelConfig &,
|
||||||
|
const OfflineSourceSeparationUvrModelConfig &, int32_t,
|
||||||
|
bool, const std::string &>(),
|
||||||
|
py::arg("spleeter") = OfflineSourceSeparationSpleeterModelConfig{},
|
||||||
|
py::arg("uvr") = OfflineSourceSeparationUvrModelConfig{},
|
||||||
|
py::arg("num_threads") = 1, py::arg("debug") = false,
|
||||||
|
py::arg("provider") = "cpu")
|
||||||
|
.def_readwrite("spleeter", &PyClass::spleeter)
|
||||||
|
.def_readwrite("uvr", &PyClass::uvr)
|
||||||
|
.def_readwrite("num_threads", &PyClass::num_threads)
|
||||||
|
.def_readwrite("debug", &PyClass::debug)
|
||||||
|
.def_readwrite("provider", &PyClass::provider)
|
||||||
|
.def("validate", &PyClass::Validate)
|
||||||
|
.def("__str__", &PyClass::ToString);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
// sherpa-onnx/python/csrc/offline-source-separation-model-config.h
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_MODEL_CONFIG_H_
|
||||||
|
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_MODEL_CONFIG_H_
|
||||||
|
|
||||||
|
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
void PybindOfflineSourceSeparationModelConfig(py::module *m);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_MODEL_CONFIG_H_
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
// sherpa-onnx/python/csrc/offline-source-separation-spleeter-model-config.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#include "sherpa-onnx/python/csrc/offline-source-separation-spleeter-model-config.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
void PybindOfflineSourceSeparationSpleeterModelConfig(py::module *m) {
|
||||||
|
using PyClass = OfflineSourceSeparationSpleeterModelConfig;
|
||||||
|
py::class_<PyClass>(*m, "OfflineSourceSeparationSpleeterModelConfig")
|
||||||
|
.def(py::init<const std::string &, const std::string &>(),
|
||||||
|
py::arg("vocals") = "", py::arg("accompaniment") = "")
|
||||||
|
.def_readwrite("vocals", &PyClass::vocals)
|
||||||
|
.def_readwrite("accompaniment", &PyClass::accompaniment)
|
||||||
|
.def("validate", &PyClass::Validate)
|
||||||
|
.def("__str__", &PyClass::ToString);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
// sherpa-onnx/python/csrc/offline-source-separation-spleeter-model-config.h
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_MODEL_CONFIG_H_
|
||||||
|
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_MODEL_CONFIG_H_
|
||||||
|
|
||||||
|
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
void PybindOfflineSourceSeparationSpleeterModelConfig(py::module *m);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_MODEL_CONFIG_H_
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
// sherpa-onnx/python/csrc/offline-source-separation-uvr-model-config.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#include "sherpa-onnx/python/csrc/offline-source-separation-uvr-model-config.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
void PybindOfflineSourceSeparationUvrModelConfig(py::module *m) {
|
||||||
|
using PyClass = OfflineSourceSeparationUvrModelConfig;
|
||||||
|
py::class_<PyClass>(*m, "OfflineSourceSeparationUvrModelConfig")
|
||||||
|
.def(py::init<const std::string &>(), py::arg("model") = "")
|
||||||
|
.def_readwrite("model", &PyClass::model)
|
||||||
|
.def("validate", &PyClass::Validate)
|
||||||
|
.def("__str__", &PyClass::ToString);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
// sherpa-onnx/python/csrc/offline-source-separation-uvr-model-config.h
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_
|
||||||
|
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_
|
||||||
|
|
||||||
|
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
void PybindOfflineSourceSeparationUvrModelConfig(py::module *m);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_
|
||||||
133
sherpa-onnx/python/csrc/offline-source-separation.cc
Normal file
133
sherpa-onnx/python/csrc/offline-source-separation.cc
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
// sherpa-onnx/python/csrc/offline-source-separation-config.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/offline-source-separation.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/python/csrc/offline-source-separation-model-config.h"
|
||||||
|
#include "sherpa-onnx/python/csrc/offline-source-separation.h"
|
||||||
|
|
||||||
|
#define C_CONTIGUOUS py::detail::npy_api::constants::NPY_ARRAY_C_CONTIGUOUS_
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
static void PybindOfflineSourceSeparationConfig(py::module *m) {
|
||||||
|
PybindOfflineSourceSeparationModelConfig(m);
|
||||||
|
|
||||||
|
using PyClass = OfflineSourceSeparationConfig;
|
||||||
|
py::class_<PyClass>(*m, "OfflineSourceSeparationConfig")
|
||||||
|
.def(py::init<const OfflineSourceSeparationModelConfig &>(),
|
||||||
|
py::arg("model") = OfflineSourceSeparationModelConfig{})
|
||||||
|
.def_readwrite("model", &PyClass::model)
|
||||||
|
.def("validate", &PyClass::Validate)
|
||||||
|
.def("__str__", &PyClass::ToString);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void PybindMultiChannelSamples(py::module *m) {
|
||||||
|
using PyClass = MultiChannelSamples;
|
||||||
|
|
||||||
|
py::class_<PyClass>(*m, "MultiChannelSamples")
|
||||||
|
.def_property_readonly("data", [](PyClass &self) -> py::object {
|
||||||
|
// if data is not empty, return a float array of
|
||||||
|
// shape (num_channels, num_samples)
|
||||||
|
int32_t num_channels = self.data.size();
|
||||||
|
if (num_channels == 0) {
|
||||||
|
return py::none();
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t num_samples = self.data[0].size();
|
||||||
|
if (num_samples == 0) {
|
||||||
|
return py::none();
|
||||||
|
}
|
||||||
|
|
||||||
|
py::array_t<float> ans({num_channels, num_samples});
|
||||||
|
|
||||||
|
py::buffer_info buf = ans.request();
|
||||||
|
auto p = static_cast<float *>(buf.ptr);
|
||||||
|
|
||||||
|
for (int32_t i = 0; i != num_channels; ++i) {
|
||||||
|
std::copy(self.data[i].begin(), self.data[i].end(),
|
||||||
|
p + i * num_samples);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ans;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
static void PybindOfflineSourceSeparationOutput(py::module *m) {
|
||||||
|
using PyClass = OfflineSourceSeparationOutput;
|
||||||
|
py::class_<PyClass>(*m, "OfflineSourceSeparationOutput")
|
||||||
|
.def_property_readonly(
|
||||||
|
"sample_rate", [](const PyClass &self) { return self.sample_rate; })
|
||||||
|
.def_property_readonly("stems",
|
||||||
|
[](const PyClass &self) { return self.stems; });
|
||||||
|
}
|
||||||
|
|
||||||
|
void PybindOfflineSourceSeparation(py::module *m) {
|
||||||
|
PybindOfflineSourceSeparationConfig(m);
|
||||||
|
PybindOfflineSourceSeparationOutput(m);
|
||||||
|
|
||||||
|
PybindMultiChannelSamples(m);
|
||||||
|
|
||||||
|
using PyClass = OfflineSourceSeparation;
|
||||||
|
py::class_<PyClass>(*m, "OfflineSourceSeparation")
|
||||||
|
.def(py::init<const OfflineSourceSeparationConfig &>(),
|
||||||
|
py::arg("config") = OfflineSourceSeparationConfig{})
|
||||||
|
.def(
|
||||||
|
"process",
|
||||||
|
[](const PyClass &self, int32_t sample_rate,
|
||||||
|
const py::array_t<float> &samples) {
|
||||||
|
if (!(C_CONTIGUOUS == (samples.flags() & C_CONTIGUOUS))) {
|
||||||
|
throw py::value_error(
|
||||||
|
"input samples should be contiguous. Please use "
|
||||||
|
"np.ascontiguousarray(samples)");
|
||||||
|
}
|
||||||
|
|
||||||
|
int num_dim = samples.ndim();
|
||||||
|
if (samples.ndim() != 2) {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "Expect an array of 2 dimensions [num_channels x "
|
||||||
|
"num_samples]. "
|
||||||
|
"Given dim: "
|
||||||
|
<< num_dim << "\n";
|
||||||
|
throw py::value_error(os.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// if num_samples is less than 10, it is very likely the user
|
||||||
|
// has swapped num_channels and num_samples.
|
||||||
|
if (samples.shape(1) < 10) {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "Expect an array of 2 dimensions [num_channels x "
|
||||||
|
"num_samples]. "
|
||||||
|
"Given ["
|
||||||
|
<< samples.shape(0) << " x " << samples.shape(1) << "]"
|
||||||
|
<< "\n";
|
||||||
|
throw py::value_error(os.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t num_channels = samples.shape(0);
|
||||||
|
int32_t num_samples = samples.shape(1);
|
||||||
|
const float *p = samples.data();
|
||||||
|
|
||||||
|
OfflineSourceSeparationInput input;
|
||||||
|
|
||||||
|
input.samples.data.resize(num_channels);
|
||||||
|
input.sample_rate = sample_rate;
|
||||||
|
|
||||||
|
for (int32_t i = 0; i != num_channels; ++i) {
|
||||||
|
input.samples.data[i] = {p + i * num_samples,
|
||||||
|
p + (i + 1) * num_samples};
|
||||||
|
}
|
||||||
|
|
||||||
|
pybind11::gil_scoped_release release;
|
||||||
|
|
||||||
|
return self.Process(input);
|
||||||
|
},
|
||||||
|
py::arg("sample_rate"), py::arg("samples"),
|
||||||
|
"samples is of shape (num_channels, num-samples) with dtype "
|
||||||
|
"np.float32");
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
16
sherpa-onnx/python/csrc/offline-source-separation.h
Normal file
16
sherpa-onnx/python/csrc/offline-source-separation.h
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
// sherpa-onnx/python/csrc/offline-source-separation-config.h
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_CONFIG_H_
|
||||||
|
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_CONFIG_H_
|
||||||
|
|
||||||
|
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
void PybindOfflineSourceSeparation(py::module *m);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_SOURCE_SEPARATION_CONFIG_H_
|
||||||
@@ -47,6 +47,7 @@ void PybindOfflineSpeechDenoiser(py::module *m) {
|
|||||||
int32_t sample_rate) {
|
int32_t sample_rate) {
|
||||||
return self.Run(samples.data(), samples.size(), sample_rate);
|
return self.Run(samples.data(), samples.size(), sample_rate);
|
||||||
},
|
},
|
||||||
|
py::arg("samples"), py::arg("sample_rate"),
|
||||||
py::call_guard<py::gil_scoped_release>())
|
py::call_guard<py::gil_scoped_release>())
|
||||||
.def(
|
.def(
|
||||||
"run",
|
"run",
|
||||||
@@ -54,6 +55,7 @@ void PybindOfflineSpeechDenoiser(py::module *m) {
|
|||||||
int32_t sample_rate) {
|
int32_t sample_rate) {
|
||||||
return self.Run(samples.data(), samples.size(), sample_rate);
|
return self.Run(samples.data(), samples.size(), sample_rate);
|
||||||
},
|
},
|
||||||
|
py::arg("samples"), py::arg("sample_rate"),
|
||||||
py::call_guard<py::gil_scoped_release>())
|
py::call_guard<py::gil_scoped_release>())
|
||||||
.def_property_readonly("sample_rate", &PyClass::GetSampleRate);
|
.def_property_readonly("sample_rate", &PyClass::GetSampleRate);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -109,19 +109,20 @@ void PybindOnlineRecognizer(py::module *m) {
|
|||||||
py::arg("hotwords"), py::call_guard<py::gil_scoped_release>())
|
py::arg("hotwords"), py::call_guard<py::gil_scoped_release>())
|
||||||
.def("is_ready", &PyClass::IsReady,
|
.def("is_ready", &PyClass::IsReady,
|
||||||
py::call_guard<py::gil_scoped_release>())
|
py::call_guard<py::gil_scoped_release>())
|
||||||
.def("decode_stream", &PyClass::DecodeStream,
|
.def("decode_stream", &PyClass::DecodeStream, py::arg("s"),
|
||||||
py::call_guard<py::gil_scoped_release>())
|
py::call_guard<py::gil_scoped_release>())
|
||||||
.def(
|
.def(
|
||||||
"decode_streams",
|
"decode_streams",
|
||||||
[](PyClass &self, std::vector<OnlineStream *> ss) {
|
[](PyClass &self, std::vector<OnlineStream *> ss) {
|
||||||
self.DecodeStreams(ss.data(), ss.size());
|
self.DecodeStreams(ss.data(), ss.size());
|
||||||
},
|
},
|
||||||
py::call_guard<py::gil_scoped_release>())
|
py::arg("ss"), py::call_guard<py::gil_scoped_release>())
|
||||||
.def("get_result", &PyClass::GetResult,
|
.def("get_result", &PyClass::GetResult, py::arg("s"),
|
||||||
py::call_guard<py::gil_scoped_release>())
|
py::call_guard<py::gil_scoped_release>())
|
||||||
.def("is_endpoint", &PyClass::IsEndpoint,
|
.def("is_endpoint", &PyClass::IsEndpoint, py::arg("s"),
|
||||||
py::call_guard<py::gil_scoped_release>())
|
py::call_guard<py::gil_scoped_release>())
|
||||||
.def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>());
|
.def("reset", &PyClass::Reset, py::arg("s"),
|
||||||
|
py::call_guard<py::gil_scoped_release>());
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace sherpa_onnx
|
} // namespace sherpa_onnx
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
#include "sherpa-onnx/python/csrc/offline-model-config.h"
|
#include "sherpa-onnx/python/csrc/offline-model-config.h"
|
||||||
#include "sherpa-onnx/python/csrc/offline-punctuation.h"
|
#include "sherpa-onnx/python/csrc/offline-punctuation.h"
|
||||||
#include "sherpa-onnx/python/csrc/offline-recognizer.h"
|
#include "sherpa-onnx/python/csrc/offline-recognizer.h"
|
||||||
|
#include "sherpa-onnx/python/csrc/offline-source-separation.h"
|
||||||
#include "sherpa-onnx/python/csrc/offline-speech-denoiser.h"
|
#include "sherpa-onnx/python/csrc/offline-speech-denoiser.h"
|
||||||
#include "sherpa-onnx/python/csrc/offline-stream.h"
|
#include "sherpa-onnx/python/csrc/offline-stream.h"
|
||||||
#include "sherpa-onnx/python/csrc/online-ctc-fst-decoder-config.h"
|
#include "sherpa-onnx/python/csrc/online-ctc-fst-decoder-config.h"
|
||||||
@@ -110,6 +111,7 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
|
|||||||
|
|
||||||
PybindAlsa(&m);
|
PybindAlsa(&m);
|
||||||
PybindOfflineSpeechDenoiser(&m);
|
PybindOfflineSpeechDenoiser(&m);
|
||||||
|
PybindOfflineSourceSeparation(&m);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace sherpa_onnx
|
} // namespace sherpa_onnx
|
||||||
|
|||||||
@@ -11,6 +11,11 @@ from _sherpa_onnx import (
|
|||||||
OfflinePunctuation,
|
OfflinePunctuation,
|
||||||
OfflinePunctuationConfig,
|
OfflinePunctuationConfig,
|
||||||
OfflinePunctuationModelConfig,
|
OfflinePunctuationModelConfig,
|
||||||
|
OfflineSourceSeparation,
|
||||||
|
OfflineSourceSeparationConfig,
|
||||||
|
OfflineSourceSeparationModelConfig,
|
||||||
|
OfflineSourceSeparationSpleeterModelConfig,
|
||||||
|
OfflineSourceSeparationUvrModelConfig,
|
||||||
OfflineSpeakerDiarization,
|
OfflineSpeakerDiarization,
|
||||||
OfflineSpeakerDiarizationConfig,
|
OfflineSpeakerDiarizationConfig,
|
||||||
OfflineSpeakerDiarizationResult,
|
OfflineSpeakerDiarizationResult,
|
||||||
|
|||||||
Reference in New Issue
Block a user