Add C++ support for UVR models (#2269)
This commit is contained in:
58
.github/scripts/test-offline-source-separation.sh
vendored
Executable file
58
.github/scripts/test-offline-source-separation.sh
vendored
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env bash
#
# Smoke test for the offline source-separation command-line tool.
#
# It downloads the Spleeter and UVR models plus one test wave, runs the tool
# once per model, and finally moves the input and all generated wave files
# into ./source-separation-wavs.
#
# Environment:
#   EXE  Command to run. Defaults to
#        ./build/bin/sherpa-onnx-offline-source-separation

set -ex

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Quote the expansion so that the test is well-formed for every value of EXE
# (unset, empty, or containing spaces).
if [ -z "$EXE" ]; then
  EXE=./build/bin/sherpa-onnx-offline-source-separation
fi

echo "EXE is $EXE"
echo "PATH: $PATH"

which $EXE

log "------------------------------------------------------------"
log "Run spleeter"
log "------------------------------------------------------------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/sherpa-onnx-spleeter-2stems-fp16.tar.bz2
tar xvf sherpa-onnx-spleeter-2stems-fp16.tar.bz2
rm sherpa-onnx-spleeter-2stems-fp16.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/qi-feng-le-zh.wav

$EXE \
  --spleeter-vocals=sherpa-onnx-spleeter-2stems-fp16/vocals.fp16.onnx \
  --spleeter-accompaniment=sherpa-onnx-spleeter-2stems-fp16/accompaniment.fp16.onnx \
  --num-threads=2 \
  --debug=1 \
  --input-wav=./qi-feng-le-zh.wav \
  --output-vocals-wav=spleeter_output_vocals.wav \
  --output-accompaniment-wav=spleeter_output_accompaniment.wav

rm -rf sherpa-onnx-spleeter-2stems-fp16

log "------------------------------------------------------------"
log "Run UVR"
log "------------------------------------------------------------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/UVR-MDX-NET-Voc_FT.onnx

$EXE \
  --debug=1 \
  --num-threads=2 \
  --uvr-model=./UVR-MDX-NET-Voc_FT.onnx \
  --input-wav=./qi-feng-le-zh.wav \
  --output-vocals-wav=uvr_output_vocals.wav \
  --output-accompaniment-wav=uvr_output_non_vocals.wav

# No trailing backslash here: a line continuation would silently glue whatever
# follows onto the rm command.
rm ./UVR-MDX-NET-Voc_FT.onnx

# -p keeps a second run in the same directory from aborting under `set -e`.
mkdir -p source-separation-wavs
mv qi-feng-le-zh.wav source-separation-wavs
mv spleeter_*.wav ./source-separation-wavs
mv uvr_*.wav ./source-separation-wavs
|
||||
16
.github/workflows/linux.yaml
vendored
16
.github/workflows/linux.yaml
vendored
@@ -11,6 +11,7 @@ on:
|
||||
- '.github/scripts/test-kws.sh'
|
||||
- '.github/scripts/test-online-transducer.sh'
|
||||
- '.github/scripts/test-offline-speech-denoiser.sh'
|
||||
- '.github/scripts/test-offline-source-separation.sh'
|
||||
- '.github/scripts/test-online-paraformer.sh'
|
||||
- '.github/scripts/test-offline-transducer.sh'
|
||||
- '.github/scripts/test-offline-ctc.sh'
|
||||
@@ -33,6 +34,7 @@ on:
|
||||
- '.github/workflows/linux.yaml'
|
||||
- '.github/scripts/test-kws.sh'
|
||||
- '.github/scripts/test-offline-speech-denoiser.sh'
|
||||
- '.github/scripts/test-offline-source-separation.sh'
|
||||
- '.github/scripts/test-online-transducer.sh'
|
||||
- '.github/scripts/test-online-paraformer.sh'
|
||||
- '.github/scripts/test-offline-transducer.sh'
|
||||
@@ -205,6 +207,20 @@ jobs:
|
||||
overwrite: true
|
||||
file: sherpa-onnx-*.tar.bz2
|
||||
|
||||
- name: Test offline source separation
|
||||
shell: bash
|
||||
run: |
|
||||
du -h -d1 .
|
||||
export PATH=$PWD/build/bin:$PATH
|
||||
export EXE=sherpa-onnx-offline-source-separation
|
||||
|
||||
.github/scripts/test-offline-source-separation.sh
|
||||
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: source-separation-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
|
||||
path: ./source-separation-wavs/*.wav
|
||||
|
||||
- name: Test offline CTC
|
||||
shell: bash
|
||||
run: |
|
||||
|
||||
16
README.md
16
README.md
@@ -1,8 +1,8 @@
|
||||
### Supported functions
|
||||
|
||||
|Speech recognition| Speech synthesis |
|
||||
|------------------|------------------|
|
||||
| ✔️ | ✔️ |
|
||||
|Speech recognition| Speech synthesis | Source separation |
|
||||
|------------------|------------------|-------------------|
|
||||
| ✔️ | ✔️ | ✔️ |
|
||||
|
||||
|Speaker identification| Speaker diarization | Speaker verification |
|
||||
|----------------------|-------------------- |------------------------|
|
||||
@@ -16,6 +16,7 @@
|
||||
|------------------|-----------------|--------------------|
|
||||
| ✔️ | ✔️ | ✔️ |
|
||||
|
||||
|
||||
### Supported platforms
|
||||
|
||||
|Architecture| Android | iOS | Windows | macOS | linux | HarmonyOS |
|
||||
@@ -56,7 +57,9 @@ This repository supports running the following functions **locally**
|
||||
- Spoken language identification
|
||||
- Audio tagging
|
||||
- VAD (e.g., [silero-vad][silero-vad])
|
||||
- Speech enhancement (e.g., [gtcrn][gtcrn])
|
||||
- Keyword spotting
|
||||
- Source separation (e.g., [spleeter][spleeter], [UVR][UVR])
|
||||
|
||||
on the following platforms and operating systems:
|
||||
|
||||
@@ -75,6 +78,7 @@ on the following platforms and operating systems:
|
||||
- [VisionFive 2][VisionFive 2]
|
||||
- [旭日X3派][旭日X3派]
|
||||
- [爱芯派][爱芯派]
|
||||
- [RK3588][RK3588]
|
||||
- etc
|
||||
|
||||
with the following APIs
|
||||
@@ -200,6 +204,7 @@ We also have spaces built using WebAssembly. They are listed below:
|
||||
| Punctuation | [Address][punct-models] |
|
||||
| Speaker segmentation | [Address][speaker-segmentation-models] |
|
||||
| Speech enhancement | [Address][speech-enhancement-models] |
|
||||
| Source separation | [Address][source-separation-models] |
|
||||
|
||||
</details>
|
||||
|
||||
@@ -481,3 +486,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
|
||||
[NVIDIA Jetson Orin NX]: https://developer.download.nvidia.com/assets/embedded/secure/jetson/orin_nx/docs/Jetson_Orin_NX_DS-10712-001_v0.5.pdf?RCPGu9Q6OVAOv7a7vgtwc9-BLScXRIWq6cSLuditMALECJ_dOj27DgnqAPGVnT2VpiNpQan9SyFy-9zRykR58CokzbXwjSA7Gj819e91AXPrWkGZR3oS1VLxiDEpJa_Y0lr7UT-N4GnXtb8NlUkP4GkCkkF_FQivGPrAucCUywL481GH_WpP_p7ziHU1Wg==&t=eyJscyI6ImdzZW8iLCJsc2QiOiJodHRwczovL3d3dy5nb29nbGUuY29tLmhrLyJ9
|
||||
[NVIDIA Jetson Nano B01]: https://www.seeedstudio.com/blog/2020/01/16/new-revision-of-jetson-nano-dev-kit-now-supports-new-jetson-nano-module/
|
||||
[speech-enhancement-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speech-enhancement-models
|
||||
[source-separation-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/source-separation-models
|
||||
[RK3588]: https://www.rock-chips.com/uploads/pdf/2022.8.26/192/RK3588%20Brief%20Datasheet.pdf
|
||||
[spleeter]: https://github.com/deezer/spleeter
|
||||
[UVR]: https://github.com/Anjok07/ultimatevocalremovergui
|
||||
[gtcrn]: https://github.com/Xiaobin-Rong/gtcrn
|
||||
|
||||
@@ -136,8 +136,8 @@ int32_t main() {
|
||||
fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
|
||||
mic_sample_rate = atof(sample_rate_str);
|
||||
}
|
||||
// OpenDevice() returns false when the device cannot be opened, so bail out
// only in that case. The previous form `!OpenDevice(...) == false` was true
// exactly when the open succeeded, i.e. it reported a failure on success.
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
                    nullptr)) {
  std::cerr << "Failed to open microphone device\n";
  return -1;
}
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
#include <iostream>
|
||||
#include <mutex> // NOLINT
|
||||
#include <queue>
|
||||
#include <thread>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
|
||||
#include "portaudio.h" // NOLINT
|
||||
|
||||
@@ -143,7 +143,7 @@ int32_t main() {
|
||||
lowpass_cutoff, lowpass_filter_width);
|
||||
}
|
||||
if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
nullptr) == false) {
|
||||
nullptr) == false) {
|
||||
std::cerr << "Failed to open microphone device\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
// cxx-api-examples/sherpa-display.cc
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
#pragma once
|
||||
|
||||
#include <stdlib.h>
|
||||
@@ -6,6 +8,8 @@
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace sherpa_onnx::cxx {
|
||||
|
||||
|
||||
@@ -159,14 +159,15 @@ static sherpa_onnx::OnlineRecognizerConfig GetOnlineRecognizerConfig(
|
||||
recognizer_config.hr.rule_fsts = SHERPA_ONNX_OR(config->hr.rule_fsts, "");
|
||||
|
||||
if (config->model_config.debug) {
|
||||
#if __OHOS__
|
||||
auto str_vec = sherpa_onnx::SplitString(recognizer_config.ToString(), 128);
|
||||
for (const auto &s : str_vec) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("%{public}s\n", s.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s\n", s.c_str());
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s", recognizer_config.ToString().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
return recognizer_config;
|
||||
@@ -507,14 +508,15 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig(
|
||||
recognizer_config.hr.rule_fsts = SHERPA_ONNX_OR(config->hr.rule_fsts, "");
|
||||
|
||||
if (config->model_config.debug) {
|
||||
#if __OHOS__
|
||||
auto str_vec = sherpa_onnx::SplitString(recognizer_config.ToString(), 128);
|
||||
for (const auto &s : str_vec) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("%{public}s\n", s.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s\n", s.c_str());
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s", recognizer_config.ToString().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
return recognizer_config;
|
||||
|
||||
@@ -55,6 +55,8 @@ set(sources
|
||||
offline-source-separation-model-config.cc
|
||||
offline-source-separation-spleeter-model-config.cc
|
||||
offline-source-separation-spleeter-model.cc
|
||||
offline-source-separation-uvr-model-config.cc
|
||||
offline-source-separation-uvr-model.cc
|
||||
offline-source-separation.cc
|
||||
|
||||
offline-stream.cc
|
||||
|
||||
@@ -25,9 +25,7 @@ Microphone::~Microphone() {
|
||||
}
|
||||
}
|
||||
|
||||
int Microphone::GetDeviceCount() const {
|
||||
return Pa_GetDeviceCount();
|
||||
}
|
||||
int Microphone::GetDeviceCount() const { return Pa_GetDeviceCount(); }
|
||||
|
||||
int Microphone::GetDefaultInputDevice() const {
|
||||
return Pa_GetDefaultInputDevice();
|
||||
@@ -43,7 +41,8 @@ void Microphone::PrintDevices(int device_index) const {
|
||||
}
|
||||
}
|
||||
|
||||
bool Microphone::OpenDevice(int index, int sample_rate, int channel, PaStreamCallback cb, void* userdata) {
|
||||
bool Microphone::OpenDevice(int index, int sample_rate, int channel,
|
||||
PaStreamCallback cb, void *userdata) {
|
||||
if (index < 0 || index >= Pa_GetDeviceCount()) {
|
||||
fprintf(stderr, "Invalid device index: %d\n", index);
|
||||
return false;
|
||||
@@ -68,7 +67,8 @@ bool Microphone::OpenDevice(int index, int sample_rate, int channel, PaStreamCal
|
||||
param.suggestedLatency = info->defaultLowInputLatency;
|
||||
param.hostApiSpecificStreamInfo = nullptr;
|
||||
|
||||
PaError err = Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
|
||||
PaError err =
|
||||
Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
|
||||
sample_rate,
|
||||
0, // frames per buffer
|
||||
paClipOff, // we won't output out of range samples
|
||||
|
||||
@@ -4,22 +4,27 @@
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_MICROPHONE_H_
|
||||
#define SHERPA_ONNX_CSRC_MICROPHONE_H_
|
||||
#include "portaudio.h" // NOLINT
|
||||
#include <cstdint>
|
||||
|
||||
#include "portaudio.h" // NOLINT
|
||||
namespace sherpa_onnx {
|
||||
|
||||
class Microphone {
|
||||
PaStream *stream = nullptr;
|
||||
public:
|
||||
Microphone();
|
||||
~Microphone();
|
||||
|
||||
int GetDeviceCount() const;
|
||||
int GetDefaultInputDevice() const;
|
||||
void PrintDevices(int sel) const;
|
||||
|
||||
bool OpenDevice(int index, int sample_rate, int channel, PaStreamCallback cb, void* userdata);
|
||||
int32_t GetDeviceCount() const;
|
||||
int32_t GetDefaultInputDevice() const;
|
||||
void PrintDevices(int32_t sel) const;
|
||||
|
||||
bool OpenDevice(int32_t index, int32_t sample_rate, int32_t channel,
|
||||
PaStreamCallback cb, void *userdata);
|
||||
|
||||
void CloseDevice();
|
||||
|
||||
private:
|
||||
PaStream *stream = nullptr;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -4,7 +4,9 @@
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
#include "android/asset_manager.h"
|
||||
@@ -16,22 +18,93 @@
|
||||
#endif
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-spleeter-impl.h"
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-uvr-impl.h"
|
||||
#include "sherpa-onnx/csrc/resample.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
std::unique_ptr<OfflineSourceSeparationImpl>
|
||||
OfflineSourceSeparationImpl::Create(
|
||||
const OfflineSourceSeparationConfig &config) {
|
||||
// TODO(fangjun): Support other models
|
||||
return std::make_unique<OfflineSourceSeparationSpleeterImpl>(config);
|
||||
if (!config.model.spleeter.vocals.empty()) {
|
||||
return std::make_unique<OfflineSourceSeparationSpleeterImpl>(config);
|
||||
}
|
||||
|
||||
if (!config.model.uvr.model.empty()) {
|
||||
return std::make_unique<OfflineSourceSeparationUvrImpl>(config);
|
||||
}
|
||||
|
||||
SHERPA_ONNX_LOGE("Please provide a separation model!");
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <typename Manager>
|
||||
std::unique_ptr<OfflineSourceSeparationImpl>
|
||||
OfflineSourceSeparationImpl::Create(
|
||||
Manager *mgr, const OfflineSourceSeparationConfig &config) {
|
||||
// TODO(fangjun): Support other models
|
||||
return std::make_unique<OfflineSourceSeparationSpleeterImpl>(mgr, config);
|
||||
if (!config.model.spleeter.vocals.empty()) {
|
||||
return std::make_unique<OfflineSourceSeparationSpleeterImpl>(mgr, config);
|
||||
}
|
||||
|
||||
if (!config.model.uvr.model.empty()) {
|
||||
return std::make_unique<OfflineSourceSeparationUvrImpl>(mgr, config);
|
||||
}
|
||||
|
||||
SHERPA_ONNX_LOGE("Please provide a separation model!");
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Returns the input converted to the sample rate expected by the model.
//
// If the input already has the sample rate returned by GetOutputSampleRate(),
// a copy of it is returned. Otherwise every channel is resampled
// independently with a LinearResample.
//
// When there are at least two channels, channel 0 and channel 1 must contain
// the same number of samples; otherwise an error is logged and the process
// exits via SHERPA_ONNX_EXIT(-1).
//
// @param input  Audio to convert.
// @param debug  If true, log the number of samples of channel 1.
// @return Audio whose sample_rate equals GetOutputSampleRate() whenever
//         resampling was needed, or a copy of `input` otherwise.
OfflineSourceSeparationInput OfflineSourceSeparationImpl::Resample(
    const OfflineSourceSeparationInput &input, bool debug /*= false*/) const {
  int32_t output_sample_rate = GetOutputSampleRate();

  // Built in place and returned by name, so the resampled audio is never
  // deep-copied on the way out.
  OfflineSourceSeparationInput ans;

  if (input.sample_rate == output_sample_rate) {
    ans = input;
  } else {
    SHERPA_ONNX_LOGE(
        "Creating a resampler:\n"
        "   in_sample_rate: %d\n"
        "   output_sample_rate: %d\n",
        input.sample_rate, output_sample_rate);

    float min_freq = std::min<int32_t>(input.sample_rate, output_sample_rate);
    float lowpass_cutoff = 0.99 * 0.5 * min_freq;

    int32_t lowpass_filter_width = 6;
    auto resampler =
        std::make_unique<LinearResample>(input.sample_rate, output_sample_rate,
                                         lowpass_cutoff, lowpass_filter_width);

    for (const auto &samples : input.samples.data) {
      // A fresh buffer per channel; nothing is read from a moved-from vector.
      std::vector<float> s;

      resampler->Reset();
      resampler->Resample(samples.data(), samples.size(), true, &s);
      ans.samples.data.push_back(std::move(s));
    }

    ans.sample_rate = output_sample_rate;
  }

  const auto &channels = ans.samples.data;

  if (channels.size() > 1) {
    if (debug) {
      SHERPA_ONNX_LOGE("input ch1 samples size: %d",
                       static_cast<int32_t>(channels[1].size()));
    }

    if (channels[0].size() != channels[1].size()) {
      SHERPA_ONNX_LOGE("ch0 samples size %d vs ch1 samples size %d",
                       static_cast<int32_t>(channels[0].size()),
                       static_cast<int32_t>(channels[1].size()));

      SHERPA_ONNX_EXIT(-1);
    }
  }

  return ans;
}
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_IMPL_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_IMPL_H_
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-source-separation.h"
|
||||
@@ -28,6 +29,9 @@ class OfflineSourceSeparationImpl {
|
||||
virtual int32_t GetOutputSampleRate() const = 0;
|
||||
|
||||
virtual int32_t GetNumberOfStems() const = 0;
|
||||
|
||||
OfflineSourceSeparationInput Resample(
|
||||
const OfflineSourceSeparationInput &input, bool debug = false) const;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -4,10 +4,13 @@
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-model-config.h"
|
||||
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
void OfflineSourceSeparationModelConfig::Register(ParseOptions *po) {
|
||||
spleeter.Register(po);
|
||||
uvr.Register(po);
|
||||
|
||||
po->Register("num-threads", &num_threads,
|
||||
"Number of threads to run the neural network");
|
||||
@@ -20,7 +23,17 @@ void OfflineSourceSeparationModelConfig::Register(ParseOptions *po) {
|
||||
}
|
||||
|
||||
bool OfflineSourceSeparationModelConfig::Validate() const {
|
||||
return spleeter.Validate();
|
||||
if (!spleeter.vocals.empty()) {
|
||||
return spleeter.Validate();
|
||||
}
|
||||
|
||||
if (!uvr.model.empty()) {
|
||||
return uvr.Validate();
|
||||
}
|
||||
|
||||
SHERPA_ONNX_LOGE("Please specify a source separation model");
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string OfflineSourceSeparationModelConfig::ToString() const {
|
||||
@@ -28,6 +41,7 @@ std::string OfflineSourceSeparationModelConfig::ToString() const {
|
||||
|
||||
os << "OfflineSourceSeparationModelConfig(";
|
||||
os << "spleeter=" << spleeter.ToString() << ", ";
|
||||
os << "uvr=" << uvr.ToString() << ", ";
|
||||
os << "num_threads=" << num_threads << ", ";
|
||||
os << "debug=" << (debug ? "True" : "False") << ", ";
|
||||
os << "provider=\"" << provider << "\")";
|
||||
|
||||
@@ -8,12 +8,14 @@
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h"
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h"
|
||||
#include "sherpa-onnx/csrc/parse-options.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
struct OfflineSourceSeparationModelConfig {
|
||||
OfflineSourceSeparationSpleeterModelConfig spleeter;
|
||||
OfflineSourceSeparationUvrModelConfig uvr;
|
||||
|
||||
int32_t num_threads = 1;
|
||||
bool debug = false;
|
||||
@@ -23,8 +25,10 @@ struct OfflineSourceSeparationModelConfig {
|
||||
|
||||
OfflineSourceSeparationModelConfig(
|
||||
const OfflineSourceSeparationSpleeterModelConfig &spleeter,
|
||||
int32_t num_threads, bool debug, const std::string &provider)
|
||||
const OfflineSourceSeparationUvrModelConfig &uvr, int32_t num_threads,
|
||||
bool debug, const std::string &provider)
|
||||
: spleeter(spleeter),
|
||||
uvr(uvr),
|
||||
num_threads(num_threads),
|
||||
debug(debug),
|
||||
provider(provider) {}
|
||||
|
||||
@@ -5,6 +5,10 @@
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_IMPL_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_IMPL_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "kaldi-native-fbank/csrc/istft.h"
|
||||
#include "kaldi-native-fbank/csrc/stft.h"
|
||||
@@ -12,13 +16,12 @@
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-spleeter-model.h"
|
||||
#include "sherpa-onnx/csrc/offline-source-separation.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
#include "sherpa-onnx/csrc/resample.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
class OfflineSourceSeparationSpleeterImpl : public OfflineSourceSeparationImpl {
|
||||
public:
|
||||
OfflineSourceSeparationSpleeterImpl(
|
||||
explicit OfflineSourceSeparationSpleeterImpl(
|
||||
const OfflineSourceSeparationConfig &config)
|
||||
: config_(config), model_(config_.model) {}
|
||||
|
||||
@@ -28,56 +31,12 @@ class OfflineSourceSeparationSpleeterImpl : public OfflineSourceSeparationImpl {
|
||||
: config_(config), model_(mgr, config_.model) {}
|
||||
|
||||
OfflineSourceSeparationOutput Process(
|
||||
const OfflineSourceSeparationInput &input) const override {
|
||||
const OfflineSourceSeparationInput *p_input = &input;
|
||||
OfflineSourceSeparationInput tmp_input;
|
||||
const OfflineSourceSeparationInput &_input) const override {
|
||||
auto input = Resample(_input, config_.model.debug);
|
||||
|
||||
int32_t output_sample_rate = GetOutputSampleRate();
|
||||
auto stft_ch0 = ComputeStft(input, 0);
|
||||
|
||||
if (input.sample_rate != output_sample_rate) {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Creating a resampler:\n"
|
||||
" in_sample_rate: %d\n"
|
||||
" output_sample_rate: %d\n",
|
||||
input.sample_rate, output_sample_rate);
|
||||
|
||||
float min_freq = std::min<int32_t>(input.sample_rate, output_sample_rate);
|
||||
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
|
||||
|
||||
int32_t lowpass_filter_width = 6;
|
||||
auto resampler = std::make_unique<LinearResample>(
|
||||
input.sample_rate, output_sample_rate, lowpass_cutoff,
|
||||
lowpass_filter_width);
|
||||
|
||||
std::vector<float> s;
|
||||
for (const auto &samples : input.samples.data) {
|
||||
resampler->Reset();
|
||||
resampler->Resample(samples.data(), samples.size(), true, &s);
|
||||
tmp_input.samples.data.push_back(std::move(s));
|
||||
}
|
||||
|
||||
tmp_input.sample_rate = output_sample_rate;
|
||||
p_input = &tmp_input;
|
||||
}
|
||||
|
||||
if (p_input->samples.data.size() > 1) {
|
||||
if (config_.model.debug) {
|
||||
SHERPA_ONNX_LOGE("input ch1 samples size: %d",
|
||||
static_cast<int32_t>(p_input->samples.data[1].size()));
|
||||
}
|
||||
|
||||
if (p_input->samples.data[0].size() != p_input->samples.data[1].size()) {
|
||||
SHERPA_ONNX_LOGE("ch0 samples size %d vs ch1 samples size %d",
|
||||
static_cast<int32_t>(p_input->samples.data[0].size()),
|
||||
static_cast<int32_t>(p_input->samples.data[1].size()));
|
||||
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
}
|
||||
|
||||
auto stft_ch0 = ComputeStft(*p_input, 0);
|
||||
|
||||
auto stft_ch1 = ComputeStft(*p_input, 1);
|
||||
auto stft_ch1 = ComputeStft(input, 1);
|
||||
knf::StftResult *p_stft_ch1 = stft_ch1.real.empty() ? &stft_ch0 : &stft_ch1;
|
||||
|
||||
int32_t num_frames = stft_ch0.num_frames;
|
||||
@@ -261,7 +220,6 @@ class OfflineSourceSeparationSpleeterImpl : public OfflineSourceSeparationImpl {
|
||||
stft_config.win_length = meta.window_length;
|
||||
stft_config.window_type = meta.window_type;
|
||||
stft_config.center = meta.center;
|
||||
stft_config.center = false;
|
||||
|
||||
return stft_config;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-spleeter_model-config.cc
|
||||
// sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.cc
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-spleeter_model-config.h
|
||||
// sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-spleeter-model-meta-data.h
|
||||
//
|
||||
// Copyright (c) 2024 Xiaomi Corporation
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_MODEL_META_DATA_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_MODEL_META_DATA_H_
|
||||
|
||||
|
||||
382
sherpa-onnx/csrc/offline-source-separation-uvr-impl.h
Normal file
382
sherpa-onnx/csrc/offline-source-separation-uvr-impl.h
Normal file
@@ -0,0 +1,382 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-uvr-impl.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_IMPL_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_IMPL_H_
|
||||
|
||||
#include <algorithm>
#include <array>
#include <utility>
#include <vector>
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "kaldi-native-fbank/csrc/istft.h"
|
||||
#include "kaldi-native-fbank/csrc/stft.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-uvr-model.h"
|
||||
#include "sherpa-onnx/csrc/offline-source-separation.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
#include "sherpa-onnx/csrc/resample.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
class OfflineSourceSeparationUvrImpl : public OfflineSourceSeparationImpl {
|
||||
public:
|
||||
explicit OfflineSourceSeparationUvrImpl(
|
||||
const OfflineSourceSeparationConfig &config)
|
||||
: config_(config), model_(config_.model) {}
|
||||
|
||||
template <typename Manager>
|
||||
OfflineSourceSeparationUvrImpl(Manager *mgr,
|
||||
const OfflineSourceSeparationConfig &config)
|
||||
: config_(config), model_(mgr, config_.model) {}
|
||||
|
||||
// Splits the input into a vocals stem and a non-vocals stem.
//
// The input is first resampled to the model's sample rate, then processed
// chunk by chunk. The non-vocals stem is the resampled input minus the
// predicted vocals.
//
// @param _input  Audio with one or two channels. With a single channel,
//                ProcessChunk() is given an empty second channel and the
//                second non-vocals channel is computed from channel 0.
// @return Two stems with two channels each: stems[0] holds the vocals,
//         stems[1] holds the non-vocals. sample_rate is
//         GetOutputSampleRate().
OfflineSourceSeparationOutput Process(
    const OfflineSourceSeparationInput &_input) const override {
  auto input = Resample(_input, config_.model.debug);

  auto &channels = input.samples.data;

  // Everything below reads channels[0]; fail loudly instead of indexing into
  // an empty container.
  if (channels.empty()) {
    SHERPA_ONNX_LOGE("The input has no audio channels");
    SHERPA_ONNX_EXIT(-1);
  }

  auto chunks_ch0 = SplitIntoChunks(channels[0]);

  std::vector<std::vector<float>> chunks_ch1;
  if (channels.size() > 1) {
    chunks_ch1 = SplitIntoChunks(channels[1]);
  }

  // Passed as the second channel when there is none. Selecting by pointer
  // avoids copying a chunk of channel 1 on every iteration.
  const std::vector<float> no_samples;

  std::vector<float> vocals_ch0;
  std::vector<float> vocals_ch1;

  int32_t num_chunks = static_cast<int32_t>(chunks_ch0.size());

  for (int32_t i = 0; i != num_chunks; ++i) {
    bool is_first_chunk = (i == 0);
    bool is_last_chunk = (i == num_chunks - 1);

    const std::vector<float> *chunk_ch1 =
        chunks_ch1.empty() ? &no_samples : &chunks_ch1[i];

    auto s = ProcessChunk(chunks_ch0[i], *chunk_ch1, is_first_chunk,
                          is_last_chunk);

    vocals_ch0.insert(vocals_ch0.end(), s.first.begin(), s.first.end());
    vocals_ch1.insert(vocals_ch1.end(), s.second.begin(), s.second.end());
  }

  // Returns mixture - vocals; the result has as many samples as `vocals`.
  auto subtract = [](auto &mixture, std::vector<float> &vocals) {
    std::vector<float> residual(vocals.size());

    Eigen::Map<Eigen::VectorXf>(residual.data(), residual.size()) =
        Eigen::Map<Eigen::VectorXf>(mixture.data(), mixture.size()).array() -
        Eigen::Map<Eigen::VectorXf>(vocals.data(), vocals.size()).array();

    return residual;
  };

  // For single-channel input, channel 0 also serves as the mixture of the
  // second output channel.
  auto &mixture_ch1 = channels.size() > 1 ? channels[1] : channels[0];

  std::vector<float> non_vocals_ch0 = subtract(channels[0], vocals_ch0);
  std::vector<float> non_vocals_ch1 = subtract(mixture_ch1, vocals_ch1);

  OfflineSourceSeparationOutput ans;
  ans.sample_rate = GetOutputSampleRate();

  ans.stems.resize(2);
  ans.stems[0].data.reserve(2);
  ans.stems[1].data.reserve(2);

  ans.stems[0].data.push_back(std::move(vocals_ch0));
  ans.stems[0].data.push_back(std::move(vocals_ch1));

  ans.stems[1].data.push_back(std::move(non_vocals_ch0));
  ans.stems[1].data.push_back(std::move(non_vocals_ch1));

  return ans;
}
|
||||
|
||||
int32_t GetOutputSampleRate() const override {
|
||||
return model_.GetMetaData().sample_rate;
|
||||
}
|
||||
|
||||
int32_t GetNumberOfStems() const override {
|
||||
return model_.GetMetaData().num_stems;
|
||||
}
|
||||
|
||||
private:
|
||||
std::pair<std::vector<float>, std::vector<float>> ProcessChunk(
|
||||
const std::vector<float> &chunk_ch0, const std::vector<float> &chunk_ch1,
|
||||
bool is_first_chunk, bool is_last_chunk) const {
|
||||
int32_t pad0 = 0;
|
||||
|
||||
auto stft_results_ch0 = ComputeStft(chunk_ch0, &pad0);
|
||||
|
||||
int32_t pad1 = pad0;
|
||||
std::vector<knf::StftResult> stft_results_ch1;
|
||||
|
||||
if (!chunk_ch1.empty()) {
|
||||
stft_results_ch1 = ComputeStft(chunk_ch1, &pad1);
|
||||
} else {
|
||||
stft_results_ch1 = stft_results_ch0;
|
||||
}
|
||||
|
||||
const auto &meta_ = model_.GetMetaData();
|
||||
|
||||
int32_t num_frames = stft_results_ch0[0].num_frames;
|
||||
int32_t dim_f = meta_.dim_f;
|
||||
int32_t dim_t = meta_.dim_t;
|
||||
int32_t n_fft_bin = meta_.n_fft / 2 + 1;
|
||||
if (num_frames != dim_t) {
|
||||
SHERPA_ONNX_LOGE("num_frames(%d) != dim_t(%d)", num_frames, dim_t);
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
|
||||
// the first 2: number of channels
|
||||
// the second 2: real and image
|
||||
std::vector<float> x(stft_results_ch0.size() * 2 * 2 * dim_f * dim_t);
|
||||
float *px = x.data();
|
||||
|
||||
for (int32_t i = 0; i != static_cast<int32_t>(stft_results_ch0.size());
|
||||
++i) {
|
||||
const auto &ch0 = stft_results_ch0[i];
|
||||
const auto &ch1 = stft_results_ch1[i];
|
||||
|
||||
const float *p_real_ch0 = ch0.real.data();
|
||||
const float *p_imag_ch0 = ch0.imag.data();
|
||||
|
||||
const float *p_real_ch1 = ch1.real.data();
|
||||
const float *p_imag_ch1 = ch1.imag.data();
|
||||
|
||||
for (int32_t j = 0; j != dim_f; ++j) {
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
*px = p_real_ch0[k * n_fft_bin + j];
|
||||
++px;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j != dim_f; ++j) {
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
*px = p_imag_ch0[k * n_fft_bin + j];
|
||||
++px;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j != dim_f; ++j) {
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
*px = p_real_ch1[k * n_fft_bin + j];
|
||||
++px;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j != dim_f; ++j) {
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
*px = p_imag_ch1[k * n_fft_bin + j];
|
||||
++px;
|
||||
}
|
||||
}
|
||||
} // for (int32_t i = 0; i !=
|
||||
|
||||
auto memory_info =
|
||||
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
|
||||
|
||||
std::array<int64_t, 4> x_shape{
|
||||
static_cast<int32_t>(stft_results_ch0.size()) * 4 / meta_.dim_c,
|
||||
meta_.dim_c, dim_f, dim_t};
|
||||
|
||||
Ort::Value x_tensor = Ort::Value::CreateTensor(
|
||||
memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());
|
||||
|
||||
Ort::Value spec = model_.Run(std::move(x_tensor));
|
||||
|
||||
const float *p_spec = spec.GetTensorData<float>();
|
||||
|
||||
for (int32_t i = 0; i != static_cast<int32_t>(stft_results_ch0.size());
|
||||
++i) {
|
||||
auto &ch0 = stft_results_ch0[i];
|
||||
auto &ch1 = stft_results_ch1[i];
|
||||
|
||||
float *p_real_ch0 = ch0.real.data();
|
||||
float *p_imag_ch0 = ch0.imag.data();
|
||||
|
||||
float *p_real_ch1 = ch1.real.data();
|
||||
float *p_imag_ch1 = ch1.imag.data();
|
||||
|
||||
for (int32_t j = 0; j != dim_f; ++j) {
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
p_real_ch0[k * n_fft_bin + j] = *p_spec;
|
||||
++p_spec;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j != dim_f; ++j) {
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
p_imag_ch0[k * n_fft_bin + j] = *p_spec;
|
||||
++p_spec;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j != dim_f; ++j) {
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
p_real_ch1[k * n_fft_bin + j] = *p_spec;
|
||||
++p_spec;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j != dim_f; ++j) {
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
p_imag_ch1[k * n_fft_bin + j] = *p_spec;
|
||||
++p_spec;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t k = 0; k != num_frames; ++k) {
|
||||
for (int32_t j = dim_f; j != n_fft_bin; ++j) {
|
||||
p_real_ch0[k * n_fft_bin + j] = 0;
|
||||
p_real_ch1[k * n_fft_bin + j] = 0;
|
||||
|
||||
p_imag_ch0[k * n_fft_bin + j] = 0;
|
||||
p_imag_ch1[k * n_fft_bin + j] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto samples_ch0 = ComputeInverseStft(stft_results_ch0, pad0,
|
||||
is_first_chunk, is_last_chunk);
|
||||
|
||||
auto samples_ch1 = ComputeInverseStft(stft_results_ch1, pad1,
|
||||
is_first_chunk, is_last_chunk);
|
||||
|
||||
return {std::move(samples_ch0), std::move(samples_ch1)};
|
||||
}
|
||||
|
||||
std::vector<float> ComputeInverseStft(
|
||||
const std::vector<knf::StftResult> &stft_result, int32_t pad,
|
||||
bool is_first_chunk, bool is_last_chunk) const {
|
||||
const auto &meta_ = model_.GetMetaData();
|
||||
int32_t trim = meta_.n_fft / 2;
|
||||
|
||||
int32_t margin = meta_.margin;
|
||||
|
||||
int32_t chunk_size = meta_.num_chunks * meta_.sample_rate;
|
||||
|
||||
if (margin > chunk_size) {
|
||||
margin = chunk_size;
|
||||
}
|
||||
|
||||
auto stft_config = GetStftConfig();
|
||||
knf::IStft istft(stft_config);
|
||||
|
||||
std::vector<float> ans;
|
||||
|
||||
for (int32_t i = 0; i != static_cast<int32_t>(stft_result.size()); ++i) {
|
||||
auto samples = istft.Compute(stft_result[i]);
|
||||
int32_t num_samples = static_cast<int32_t>(samples.size());
|
||||
|
||||
ans.insert(ans.end(), samples.begin() + trim,
|
||||
samples.begin() + (num_samples - trim));
|
||||
}
|
||||
|
||||
int32_t start = is_first_chunk ? 0 : margin;
|
||||
int32_t end =
|
||||
is_last_chunk ? (ans.size() - pad) : (ans.size() - pad - margin);
|
||||
|
||||
return {ans.begin() + start, ans.begin() + end};
|
||||
}
|
||||
|
||||
std::vector<knf::StftResult> ComputeStft(const std::vector<float> &chunk,
|
||||
int32_t *pad) const {
|
||||
const auto &meta_ = model_.GetMetaData();
|
||||
|
||||
int32_t num_samples = static_cast<int32_t>(chunk.size());
|
||||
int32_t trim = meta_.n_fft / 2;
|
||||
int32_t chunk_size = meta_.hop_length * (meta_.dim_t - 1);
|
||||
int32_t gen_size = chunk_size - 2 * trim;
|
||||
*pad = gen_size - num_samples % gen_size;
|
||||
|
||||
std::vector<float> samples(trim + chunk.size() + *pad + trim);
|
||||
std::copy(chunk.begin(), chunk.end(), samples.begin() + trim);
|
||||
|
||||
auto stft_config = GetStftConfig();
|
||||
knf::Stft stft(stft_config);
|
||||
|
||||
std::vector<knf::StftResult> stft_results;
|
||||
// split the chunk into short segments
|
||||
for (int32_t i = 0; i < num_samples + *pad; i += gen_size) {
|
||||
auto r = stft.Compute(samples.data() + i, chunk_size);
|
||||
stft_results.push_back(std::move(r));
|
||||
}
|
||||
|
||||
return stft_results;
|
||||
}
|
||||
|
||||
std::vector<std::vector<float>> SplitIntoChunks(
|
||||
const std::vector<float> &samples) const {
|
||||
std::vector<std::vector<float>> ans;
|
||||
|
||||
if (samples.empty()) {
|
||||
return ans;
|
||||
}
|
||||
|
||||
const auto &meta_ = model_.GetMetaData();
|
||||
int32_t margin = meta_.margin;
|
||||
|
||||
int32_t chunk_size = meta_.num_chunks * meta_.sample_rate;
|
||||
|
||||
if (static_cast<int32_t>(samples.size()) < chunk_size) {
|
||||
chunk_size = samples.size();
|
||||
}
|
||||
|
||||
if (margin > chunk_size) {
|
||||
margin = chunk_size;
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < static_cast<int32_t>(samples.size());
|
||||
i += chunk_size) {
|
||||
int32_t start = std::max<int32_t>(0, i - margin);
|
||||
int32_t end = std::min<int32_t>(i + chunk_size + margin,
|
||||
static_cast<int32_t>(samples.size()));
|
||||
if (start >= end) {
|
||||
break;
|
||||
}
|
||||
|
||||
ans.emplace_back(samples.begin() + start, samples.begin() + end);
|
||||
|
||||
if (end == static_cast<int32_t>(samples.size())) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
// Builds the STFT configuration from the model's metadata.
//
// @return A knf::StftConfig whose n_fft, hop_length, win_length, window_type
//         and center are copied from the metadata; every other field keeps
//         its default value.
knf::StftConfig GetStftConfig() const {
  const auto &m = model_.GetMetaData();

  knf::StftConfig cfg;
  cfg.center = m.center;
  cfg.window_type = m.window_type;
  cfg.win_length = m.window_length;
  cfg.hop_length = m.hop_length;
  cfg.n_fft = m.n_fft;
  return cfg;
}
|
||||
|
||||
private:
|
||||
OfflineSourceSeparationConfig config_;
|
||||
OfflineSourceSeparationUvrModel model_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_IMPL_H_
|
||||
@@ -0,0 +1,39 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-uvr-model-config.cc
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h"
|
||||
|
||||
#include "sherpa-onnx/csrc/file-utils.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Registers the "uvr-model" option with `po` and binds it to the `model`
// member, so that the parsed value is stored in this config.
//
// @param po Option parser to register with.
void OfflineSourceSeparationUvrModelConfig::Register(ParseOptions *po) {
|
||||
po->Register("uvr-model", &model, "Path to the UVR model");
|
||||
}
|
||||
|
||||
bool OfflineSourceSeparationUvrModelConfig::Validate() const {
|
||||
if (model.empty()) {
|
||||
SHERPA_ONNX_LOGE("Please provide --uvr-model");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(model)) {
|
||||
SHERPA_ONNX_LOGE("UVR model '%s' does not exist. ", model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns a human readable description of this config, of the form
//   OfflineSourceSeparationUvrModelConfig(model="<path>")
std::string OfflineSourceSeparationUvrModelConfig::ToString() const {
  std::string text = "OfflineSourceSeparationUvrModelConfig(";
  text += "model=\"";
  text += model;
  text += "\")";
  return text;
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
@@ -0,0 +1,32 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h"
|
||||
#include "sherpa-onnx/csrc/parse-options.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Configuration of a UVR source separation model.
struct OfflineSourceSeparationUvrModelConfig {
|
||||
// Path to the UVR model file. Validate() requires it to be non-empty and
// to name an existing file.
std::string model;
|
||||
|
||||
OfflineSourceSeparationUvrModelConfig() = default;
|
||||
|
||||
// Creates a config that uses the model at the given path.
explicit OfflineSourceSeparationUvrModelConfig(const std::string &model)
|
||||
: model(model) {}
|
||||
|
||||
// Registers the command-line option "uvr-model" with `po`.
void Register(ParseOptions *po);
|
||||
|
||||
// Returns true if the config is usable; logs an error and returns false
// otherwise.
bool Validate() const;
|
||||
|
||||
// Returns a human readable description of this config.
std::string ToString() const;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_
|
||||
@@ -0,0 +1,38 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-uvr-model-meta-data.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_META_DATA_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_META_DATA_H_
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// See also
|
||||
// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/uvr_mdx/test.py
|
||||
// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/uvr_mdx/add_meta_data_and_quantize.py
|
||||
// Metadata of a UVR source separation model.
//
// Most fields are overwritten with the values stored in the custom metadata
// of the ONNX model when the model is loaded (see
// ./offline-source-separation-uvr-model.cc); the initializers below are
// only the values used before that happens.
//
// All members are intentionally non-const so that the struct stays
// copy- and move-assignable like any other plain value type.
struct OfflineSourceSeparationUvrModelMetaData {
  // Sample rate expected by the model. Metadata key: "sample_rate".
  int32_t sample_rate = 44100;

  // Number of stems. Metadata key: "stems". The loader only accepts 2.
  int32_t num_stems = 2;

  // dim_c, dim_f and dim_t are used as dimensions 1, 2 and 3 of the model's
  // input tensor. Metadata keys: "dim_c", "dim_f" and "dim_t".
  int32_t dim_c = -1;
  int32_t dim_f = -1;
  int32_t dim_t = -1;

  // STFT parameters. Metadata keys: "n_fft" and "hop_length".
  int32_t n_fft = -1;
  int32_t hop_length = 1024;

  // STFT window parameters. Metadata keys: "win_length", "center" and
  // "window_type".
  int32_t window_length = -1;
  int32_t center = 1;
  std::string window_type = "hann";

  // the following fields are preconfigured. Please see
  // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/uvr_mdx/test.py

  // Number of samples by which a chunk is extended on each side.
  int32_t margin = 0;  // changed in ./offline-source-separation-uvr-model.cc

  // A chunk covers num_chunks * sample_rate samples. Not read from the model;
  // it was previously declared const, which deleted the implicit assignment
  // operators of this struct.
  int32_t num_chunks = 15;
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_META_DATA_H_
|
||||
172
sherpa-onnx/csrc/offline-source-separation-uvr-model.cc
Normal file
172
sherpa-onnx/csrc/offline-source-separation-uvr-model.cc
Normal file
@@ -0,0 +1,172 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-uvr-model.cc
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-uvr-model.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
#include "android/asset_manager.h"
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#if __OHOS__
|
||||
#include "rawfile/raw_file_manager.h"
|
||||
#endif
|
||||
|
||||
#include "sherpa-onnx/csrc/file-utils.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
#include "sherpa-onnx/csrc/session.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Private implementation of OfflineSourceSeparationUvrModel.
//
// It creates an onnxruntime session from the bytes of the UVR model, caches
// the names of the session's inputs and outputs, and reads the model's
// custom metadata into an OfflineSourceSeparationUvrModelMetaData.
class OfflineSourceSeparationUvrModel::Impl {
 public:
  // Loads the model from the file config.uvr.model.
  explicit Impl(const OfflineSourceSeparationModelConfig &config)
      : config_(config),
        env_(ORT_LOGGING_LEVEL_ERROR),
        sess_opts_(GetSessionOptions(config)),
        allocator_{} {
    auto model_bytes = ReadFile(config.uvr.model);
    Init(model_bytes.data(), model_bytes.size());
  }

  // Same as the constructor above, except that config.uvr.model is read
  // through the given manager.
  template <typename Manager>
  Impl(Manager *mgr, const OfflineSourceSeparationModelConfig &config)
      : config_(config),
        env_(ORT_LOGGING_LEVEL_ERROR),
        sess_opts_(GetSessionOptions(config)),
        allocator_{} {
    auto model_bytes = ReadFile(mgr, config.uvr.model);
    Init(model_bytes.data(), model_bytes.size());
  }

  // Returns the metadata read from the model.
  const OfflineSourceSeparationUvrModelMetaData &GetMetaData() const {
    return meta_;
  }

  // Runs the model with `x` as its only input and returns the first output.
  Ort::Value Run(Ort::Value x) const {
    auto outputs =
        sess_->Run({}, input_names_ptr_.data(), &x, 1, output_names_ptr_.data(),
                   output_names_ptr_.size());

    return std::move(outputs.front());
  }

 private:
  // Creates the session from the in-memory model, collects the input and
  // output names, and fills meta_ from the model's metadata. The process
  // exits if the model type is not 'UVR' or the number of stems is not 2.
  void Init(void *model_data, size_t model_data_length) {
    sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
                                           sess_opts_);

    GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
    GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);

    Ort::ModelMetadata meta_data = sess_->GetModelMetadata();

    if (config_.debug) {
      std::ostringstream os;

      // Writes one "<index> <name>" line per entry.
      const auto dump_names = [&os](const std::vector<std::string> &names) {
        for (int32_t idx = 0; idx != static_cast<int32_t>(names.size());
             ++idx) {
          os << idx << " " << names[idx] << "\n";
        }
      };

      os << "---UVR model---\n";
      PrintModelMetadata(os, meta_data);

      os << "----------input names----------\n";
      dump_names(input_names_);

      os << "----------output names----------\n";
      dump_names(output_names_);

#if __OHOS__
      SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
#else
      SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
#endif
    }

    Ort::AllocatorWithDefaultOptions allocator;  // used in the macro below

    std::string model_type;
    SHERPA_ONNX_READ_META_DATA_STR(model_type, "model_type");
    if (model_type != "UVR") {
      SHERPA_ONNX_LOGE("Expect model type 'UVR'. Given: '%s'",
                       model_type.c_str());
      SHERPA_ONNX_EXIT(-1);
    }

    SHERPA_ONNX_READ_META_DATA(meta_.num_stems, "stems");
    if (meta_.num_stems != 2) {
      SHERPA_ONNX_LOGE("Only 2stems is supported. Given %d stems",
                       meta_.num_stems);
      SHERPA_ONNX_EXIT(-1);
    }

    SHERPA_ONNX_READ_META_DATA(meta_.sample_rate, "sample_rate");
    SHERPA_ONNX_READ_META_DATA(meta_.n_fft, "n_fft");
    SHERPA_ONNX_READ_META_DATA(meta_.center, "center");
    SHERPA_ONNX_READ_META_DATA(meta_.window_length, "win_length");
    SHERPA_ONNX_READ_META_DATA(meta_.hop_length, "hop_length");
    SHERPA_ONNX_READ_META_DATA(meta_.dim_t, "dim_t");
    SHERPA_ONNX_READ_META_DATA(meta_.dim_f, "dim_f");
    SHERPA_ONNX_READ_META_DATA(meta_.dim_c, "dim_c");
    SHERPA_ONNX_READ_META_DATA_STR(meta_.window_type, "window_type");

    // The margin is as many samples as the sample rate.
    meta_.margin = meta_.sample_rate;
  }

 private:
  OfflineSourceSeparationModelConfig config_;
  OfflineSourceSeparationUvrModelMetaData meta_;

  Ort::Env env_;
  Ort::SessionOptions sess_opts_;
  Ort::AllocatorWithDefaultOptions allocator_;

  std::unique_ptr<Ort::Session> sess_;

  // The *_ptr_ vectors are the C-string views of the names that are passed
  // to the session when running it.
  std::vector<std::string> input_names_;
  std::vector<const char *> input_names_ptr_;

  std::vector<std::string> output_names_;
  std::vector<const char *> output_names_ptr_;
};
|
||||
|
||||
// Defined here rather than in the header because the destructor of
// std::unique_ptr<Impl> needs the complete definition of Impl.
OfflineSourceSeparationUvrModel::~OfflineSourceSeparationUvrModel() = default;
|
||||
|
||||
// Creates the model from the file named by config.uvr.model.
OfflineSourceSeparationUvrModel::OfflineSourceSeparationUvrModel(
|
||||
const OfflineSourceSeparationModelConfig &config)
|
||||
: impl_(std::make_unique<Impl>(config)) {}
|
||||
|
||||
// Creates the model by reading config.uvr.model through `mgr`.
//
// This template is defined in this .cc file, so it is usable only for the
// Manager types that are explicitly instantiated in this file.
template <typename Manager>
|
||||
OfflineSourceSeparationUvrModel::OfflineSourceSeparationUvrModel(
|
||||
Manager *mgr, const OfflineSourceSeparationModelConfig &config)
|
||||
: impl_(std::make_unique<Impl>(mgr, config)) {}
|
||||
|
||||
// Runs the model on `x` by forwarding it to the implementation, which
// returns the first output of the session.
Ort::Value OfflineSourceSeparationUvrModel::Run(Ort::Value x) const {
|
||||
return impl_->Run(std::move(x));
|
||||
}
|
||||
|
||||
// Returns the metadata that was read from the model. The reference refers
// to a member of the implementation object.
const OfflineSourceSeparationUvrModelMetaData &
|
||||
OfflineSourceSeparationUvrModel::GetMetaData() const {
|
||||
return impl_->GetMetaData();
|
||||
}
|
||||
|
||||
// Explicit instantiations of the templated constructor. They are required
// because the template is defined in this .cc file.

// Instantiation for AAssetManager, compiled when __ANDROID_API__ >= 9.
#if __ANDROID_API__ >= 9
|
||||
template OfflineSourceSeparationUvrModel::OfflineSourceSeparationUvrModel(
|
||||
AAssetManager *mgr, const OfflineSourceSeparationModelConfig &config);
|
||||
#endif
|
||||
|
||||
// Instantiation for NativeResourceManager, compiled when __OHOS__ is set.
#if __OHOS__
|
||||
template OfflineSourceSeparationUvrModel::OfflineSourceSeparationUvrModel(
|
||||
NativeResourceManager *mgr,
|
||||
const OfflineSourceSeparationModelConfig &config);
|
||||
#endif
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
36
sherpa-onnx/csrc/offline-source-separation-uvr-model.h
Normal file
36
sherpa-onnx/csrc/offline-source-separation-uvr-model.h
Normal file
@@ -0,0 +1,36 @@
|
||||
// sherpa-onnx/csrc/offline-source-separation-uvr-model.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_H_
|
||||
#include <memory>
|
||||
|
||||
#include "onnxruntime_cxx_api.h" // NOLINT
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-model-config.h"
|
||||
#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-meta-data.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// A UVR source separation model.
//
// The class only exposes running the model and reading its metadata; all
// state lives in the private Impl class, which is defined in the .cc file.
class OfflineSourceSeparationUvrModel {
|
||||
public:
|
||||
// Declared here and defined in the .cc file, where Impl is complete.
~OfflineSourceSeparationUvrModel();
|
||||
|
||||
// Creates the model from the file named by config.uvr.model.
explicit OfflineSourceSeparationUvrModel(
|
||||
const OfflineSourceSeparationModelConfig &config);
|
||||
|
||||
// Creates the model by reading config.uvr.model through `mgr`.
// Only the Manager types explicitly instantiated in the .cc file can be
// used.
template <typename Manager>
|
||||
OfflineSourceSeparationUvrModel(
|
||||
Manager *mgr, const OfflineSourceSeparationModelConfig &config);
|
||||
|
||||
// Runs the model with `x` as its only input and returns the first output.
Ort::Value Run(Ort::Value x) const;
|
||||
|
||||
// Returns the metadata that was read from the model.
const OfflineSourceSeparationUvrModelMetaData &GetMetaData() const;
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_H_
|
||||
@@ -19,7 +19,8 @@ struct OfflineSourceSeparationConfig {
|
||||
|
||||
OfflineSourceSeparationConfig() = default;
|
||||
|
||||
OfflineSourceSeparationConfig(const OfflineSourceSeparationModelConfig &model)
|
||||
explicit OfflineSourceSeparationConfig(
|
||||
const OfflineSourceSeparationModelConfig &model)
|
||||
: model(model) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
@@ -54,7 +55,7 @@ class OfflineSourceSeparation {
|
||||
public:
|
||||
~OfflineSourceSeparation();
|
||||
|
||||
OfflineSourceSeparation(const OfflineSourceSeparationConfig &config);
|
||||
explicit OfflineSourceSeparation(const OfflineSourceSeparationConfig &config);
|
||||
|
||||
template <typename Manager>
|
||||
OfflineSourceSeparation(Manager *mgr,
|
||||
|
||||
@@ -101,8 +101,8 @@ for a list of pre-trained models to download.
|
||||
mic_sample_rate = atof(pSampleRateStr);
|
||||
}
|
||||
|
||||
if(!mic.OpenDevice(device_index, mic_sample_rate, 1,
|
||||
RecordCallback, s.get())) {
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
s.get())) {
|
||||
fprintf(stderr, "portaudio error: %d\n", device_index);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -142,8 +142,8 @@ for more models.
|
||||
mic_sample_rate = atof(pSampleRateStr);
|
||||
}
|
||||
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1,
|
||||
RecordCallback, nullptr /* user_data */)){
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
nullptr /* user_data */)) {
|
||||
fprintf(stderr, "portaudio error: %d\n", device_index);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -244,8 +244,8 @@ Note that `zh` means Chinese, while `en` means English.
|
||||
mic_sample_rate = atof(pSampleRateStr);
|
||||
}
|
||||
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1,
|
||||
RecordCallback, nullptr /* user_data */)){
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
nullptr /* user_data */)) {
|
||||
fprintf(stderr, "portaudio error: %d\n", device_index);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -159,8 +159,8 @@ for a list of pre-trained models to download.
|
||||
mic_sample_rate = atof(pSampleRateStr);
|
||||
}
|
||||
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1,
|
||||
RecordCallback, nullptr /* user_data */)){
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
nullptr /* user_data */)) {
|
||||
fprintf(stderr, "portaudio error: %d\n", device_index);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -129,8 +129,8 @@ for a list of pre-trained models to download.
|
||||
mic_sample_rate = atof(pSampleRateStr);
|
||||
}
|
||||
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1,
|
||||
RecordCallback, nullptr /* user_data */)){
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
nullptr /* user_data */)) {
|
||||
fprintf(stderr, "portaudio error: %d\n", device_index);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -33,6 +33,17 @@ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-m
|
||||
--input-wav=audio_example.wav \
|
||||
--output-vocals-wav=output_vocals.wav \
|
||||
--output-accompaniment-wav=output_accompaniment.wav
|
||||
|
||||
(2) Use UVR models
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/UVR_MDXNET_1_9703.onnx
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/audio_example.wav
|
||||
|
||||
./bin/sherpa-onnx-offline-source-separation \
|
||||
--uvr-model=./UVR_MDXNET_1_9703.onnx \
|
||||
--input-wav=audio_example.wav \
|
||||
--output-vocals-wav=output_vocals.wav \
|
||||
--output-accompaniment-wav=output_accompaniment.wav
|
||||
)usage";
|
||||
|
||||
sherpa_onnx::ParseOptions po(kUsageMessage);
|
||||
|
||||
@@ -136,7 +136,8 @@ to download models for offline ASR.
|
||||
mic_sample_rate = atof(pSampleRateStr);
|
||||
}
|
||||
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, nullptr)) {
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
nullptr)) {
|
||||
fprintf(stderr, "Failed to open device %d\n", device_index);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -74,7 +74,6 @@ wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/siler
|
||||
|
||||
sherpa_onnx::Microphone mic;
|
||||
|
||||
|
||||
int32_t device_index = Pa_GetDefaultInputDevice();
|
||||
if (device_index == paNoDevice) {
|
||||
fprintf(stderr, "No default input device found\n");
|
||||
@@ -96,7 +95,8 @@ wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/siler
|
||||
fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
|
||||
mic_sample_rate = atof(pSampleRateStr);
|
||||
}
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, nullptr)) {
|
||||
if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
nullptr)) {
|
||||
fprintf(stderr, "Failed to open microphone device %d\n", device_index);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "sherpa-onnx/csrc/offline-tts.h"
|
||||
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.h"
|
||||
#include "sherpa-onnx/csrc/wave-writer.h"
|
||||
#include "sherpa-onnx/jni/common.h"
|
||||
|
||||
@@ -207,7 +208,10 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_newFromAsset(
|
||||
}
|
||||
#endif
|
||||
auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config);
|
||||
SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str());
|
||||
auto str_vec = sherpa_onnx::SplitString(config.ToString(), 128);
|
||||
for (const auto &s : str_vec) {
|
||||
SHERPA_ONNX_LOGE("%s", s.c_str());
|
||||
}
|
||||
|
||||
auto tts = new sherpa_onnx::OfflineTts(
|
||||
#if __ANDROID_API__ >= 9
|
||||
|
||||
Reference in New Issue
Block a user