diff --git a/.github/scripts/test-offline-source-separation.sh b/.github/scripts/test-offline-source-separation.sh new file mode 100755 index 00000000..d65c75df --- /dev/null +++ b/.github/scripts/test-offline-source-separation.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +if [ -z "$EXE" ]; then + EXE=./build/bin/sherpa-onnx-offline-source-separation +fi + +echo "EXE is $EXE" +echo "PATH: $PATH" + +which $EXE + +log "------------------------------------------------------------" +log "Run spleeter" +log "------------------------------------------------------------" +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/sherpa-onnx-spleeter-2stems-fp16.tar.bz2 +tar xvf sherpa-onnx-spleeter-2stems-fp16.tar.bz2 +rm sherpa-onnx-spleeter-2stems-fp16.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/qi-feng-le-zh.wav + +$EXE \ + --spleeter-vocals=sherpa-onnx-spleeter-2stems-fp16/vocals.fp16.onnx \ + --spleeter-accompaniment=sherpa-onnx-spleeter-2stems-fp16/accompaniment.fp16.onnx \ + --num-threads=2 \ + --debug=1 \ + --input-wav=./qi-feng-le-zh.wav \ + --output-vocals-wav=spleeter_output_vocals.wav \ + --output-accompaniment-wav=spleeter_output_accompaniment.wav + +rm -rf sherpa-onnx-spleeter-2stems-fp16 + +log "------------------------------------------------------------" +log "Run UVR" +log "------------------------------------------------------------" +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/UVR-MDX-NET-Voc_FT.onnx + +$EXE \ + --debug=1 \ + --num-threads=2 \ + --uvr-model=./UVR-MDX-NET-Voc_FT.onnx \ + --input-wav=./qi-feng-le-zh.wav \ + --output-vocals-wav=uvr_output_vocals.wav \ + --output-accompaniment-wav=uvr_output_non_vocals.wav + +rm 
./UVR-MDX-NET-Voc_FT.onnx + +mkdir source-separation-wavs +mv qi-feng-le-zh.wav source-separation-wavs +mv spleeter_*.wav ./source-separation-wavs +mv uvr_*.wav ./source-separation-wavs diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 4a5277f1..ff796e21 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -11,6 +11,7 @@ on: - '.github/scripts/test-kws.sh' - '.github/scripts/test-online-transducer.sh' - '.github/scripts/test-offline-speech-denoiser.sh' + - '.github/scripts/test-offline-source-separation.sh' - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' @@ -33,6 +34,7 @@ on: - '.github/workflows/linux.yaml' - '.github/scripts/test-kws.sh' - '.github/scripts/test-offline-speech-denoiser.sh' + - '.github/scripts/test-offline-source-separation.sh' - '.github/scripts/test-online-transducer.sh' - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' @@ -205,6 +207,20 @@ jobs: overwrite: true file: sherpa-onnx-*.tar.bz2 + - name: Test offline source separation + shell: bash + run: | + du -h -d1 . 
+ export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline-source-separation + + .github/scripts/test-offline-source-separation.sh + + - uses: actions/upload-artifact@v4 + with: + name: source-separation-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} + path: ./source-separation-wavs/*.wav + - name: Test offline CTC shell: bash run: | diff --git a/README.md b/README.md index 6c933bc3..ff9aa725 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ### Supported functions -|Speech recognition| Speech synthesis | -|------------------|------------------| -| ✔️ | ✔️ | +|Speech recognition| Speech synthesis | Source separation | +|------------------|------------------|-------------------| +| ✔️ | ✔️ | ✔️ | |Speaker identification| Speaker diarization | Speaker verification | |----------------------|-------------------- |------------------------| @@ -16,6 +16,7 @@ |------------------|-----------------|--------------------| | ✔️ | ✔️ | ✔️ | + ### Supported platforms |Architecture| Android | iOS | Windows | macOS | linux | HarmonyOS | @@ -56,7 +57,9 @@ This repository supports running the following functions **locally** - Spoken language identification - Audio tagging - VAD (e.g., [silero-vad][silero-vad]) + - Speech enhancement (e.g., [gtcrn][gtcrn]) - Keyword spotting + - Source separation (e.g., [spleeter][spleeter], [UVR][UVR]) on the following platforms and operating systems: @@ -75,6 +78,7 @@ on the following platforms and operating systems: - [VisionFive 2][VisionFive 2] - [旭日X3派][旭日X3派] - [爱芯派][爱芯派] + - [RK3588][RK3588] - etc with the following APIs @@ -200,6 +204,7 @@ We also have spaces built using WebAssembly. 
They are listed below: | Punctuation | [Address][punct-models] | | Speaker segmentation | [Address][speaker-segmentation-models] | | Speech enhancement | [Address][speech-enhancement-models] | +| Source separation | [Address][source-separation-models] | @@ -481,3 +486,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. [NVIDIA Jetson Orin NX]: https://developer.download.nvidia.com/assets/embedded/secure/jetson/orin_nx/docs/Jetson_Orin_NX_DS-10712-001_v0.5.pdf?RCPGu9Q6OVAOv7a7vgtwc9-BLScXRIWq6cSLuditMALECJ_dOj27DgnqAPGVnT2VpiNpQan9SyFy-9zRykR58CokzbXwjSA7Gj819e91AXPrWkGZR3oS1VLxiDEpJa_Y0lr7UT-N4GnXtb8NlUkP4GkCkkF_FQivGPrAucCUywL481GH_WpP_p7ziHU1Wg==&t=eyJscyI6ImdzZW8iLCJsc2QiOiJodHRwczovL3d3dy5nb29nbGUuY29tLmhrLyJ9 [NVIDIA Jetson Nano B01]: https://www.seeedstudio.com/blog/2020/01/16/new-revision-of-jetson-nano-dev-kit-now-supports-new-jetson-nano-module/ [speech-enhancement-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speech-enhancement-models +[source-separation-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/source-separation-models +[RK3588]: https://www.rock-chips.com/uploads/pdf/2022.8.26/192/RK3588%20Brief%20Datasheet.pdf +[spleeter]: https://github.com/deezer/spleeter +[UVR]: https://github.com/Anjok07/ultimatevocalremovergui +[gtcrn]: https://github.com/Xiaobin-Rong/gtcrn diff --git a/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc index 922ac158..bc58d9ec 100644 --- a/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc +++ b/cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc @@ -136,8 +136,8 @@ int32_t main() { fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); mic_sample_rate = atof(sample_rate_str); } - if(!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, - nullptr) == false) { + if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + 
nullptr) == false) { std::cerr << "Failed to open microphone device\n"; return -1; } diff --git a/cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc b/cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc index ab096d3e..603a7098 100644 --- a/cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc +++ b/cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc @@ -24,7 +24,7 @@ #include #include // NOLINT #include -#include +#include // NOLINT #include #include "portaudio.h" // NOLINT diff --git a/cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc index f1efc079..ed9aa891 100644 --- a/cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc +++ b/cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc @@ -143,7 +143,7 @@ int32_t main() { lowpass_cutoff, lowpass_filter_width); } if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, - nullptr) == false) { + nullptr) == false) { std::cerr << "Failed to open microphone device\n"; return -1; } diff --git a/cxx-api-examples/sherpa-display.h b/cxx-api-examples/sherpa-display.h index ca8d286c..8fbcdd34 100644 --- a/cxx-api-examples/sherpa-display.h +++ b/cxx-api-examples/sherpa-display.h @@ -1,3 +1,5 @@ +// cxx-api-examples/sherpa-display.h +// Copyright (c) 2025 Xiaomi Corporation #pragma once #include @@ -6,6 +8,8 @@ #include #include #include +#include +#include namespace sherpa_onnx::cxx { diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index f38f85a0..2cfa4ac1 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -159,14 +159,15 @@ static sherpa_onnx::OnlineRecognizerConfig GetOnlineRecognizerConfig( recognizer_config.hr.rule_fsts = SHERPA_ONNX_OR(config->hr.rule_fsts, ""); if (config->model_config.debug) { +#if __OHOS__ auto str_vec = sherpa_onnx::SplitString(recognizer_config.ToString(), 128); for (const 
auto &s : str_vec) { -#if __OHOS__ SHERPA_ONNX_LOGE("%{public}s\n", s.c_str()); -#else SHERPA_ONNX_LOGE("%s\n", s.c_str()); -#endif } +#else + SHERPA_ONNX_LOGE("%s", recognizer_config.ToString().c_str()); +#endif } return recognizer_config; @@ -507,14 +508,15 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig( recognizer_config.hr.rule_fsts = SHERPA_ONNX_OR(config->hr.rule_fsts, ""); if (config->model_config.debug) { +#if __OHOS__ auto str_vec = sherpa_onnx::SplitString(recognizer_config.ToString(), 128); for (const auto &s : str_vec) { -#if __OHOS__ SHERPA_ONNX_LOGE("%{public}s\n", s.c_str()); -#else SHERPA_ONNX_LOGE("%s\n", s.c_str()); -#endif } +#else + SHERPA_ONNX_LOGE("%s", recognizer_config.ToString().c_str()); +#endif } return recognizer_config; diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index f704bfeb..99f7e9a2 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -55,6 +55,8 @@ set(sources offline-source-separation-model-config.cc offline-source-separation-spleeter-model-config.cc offline-source-separation-spleeter-model.cc + offline-source-separation-uvr-model-config.cc + offline-source-separation-uvr-model.cc offline-source-separation.cc offline-stream.cc diff --git a/sherpa-onnx/csrc/microphone.cc b/sherpa-onnx/csrc/microphone.cc index 0f749eda..b378b249 100644 --- a/sherpa-onnx/csrc/microphone.cc +++ b/sherpa-onnx/csrc/microphone.cc @@ -25,9 +25,7 @@ Microphone::~Microphone() { } } -int Microphone::GetDeviceCount() const { - return Pa_GetDeviceCount(); -} +int Microphone::GetDeviceCount() const { return Pa_GetDeviceCount(); } int Microphone::GetDefaultInputDevice() const { return Pa_GetDefaultInputDevice(); @@ -43,7 +41,8 @@ void Microphone::PrintDevices(int device_index) const { } } -bool Microphone::OpenDevice(int index, int sample_rate, int channel, PaStreamCallback cb, void* userdata) { +bool Microphone::OpenDevice(int index, int sample_rate, int channel, + 
PaStreamCallback cb, void *userdata) { if (index < 0 || index >= Pa_GetDeviceCount()) { fprintf(stderr, "Invalid device index: %d\n", index); return false; @@ -68,7 +67,8 @@ bool Microphone::OpenDevice(int index, int sample_rate, int channel, PaStreamCal param.suggestedLatency = info->defaultLowInputLatency; param.hostApiSpecificStreamInfo = nullptr; - PaError err = Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */ + PaError err = + Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */ sample_rate, 0, // frames per buffer paClipOff, // we won't output out of range samples diff --git a/sherpa-onnx/csrc/microphone.h b/sherpa-onnx/csrc/microphone.h index 8d651793..f3f9feb6 100644 --- a/sherpa-onnx/csrc/microphone.h +++ b/sherpa-onnx/csrc/microphone.h @@ -4,22 +4,27 @@ #ifndef SHERPA_ONNX_CSRC_MICROPHONE_H_ #define SHERPA_ONNX_CSRC_MICROPHONE_H_ -#include "portaudio.h" // NOLINT +#include +#include "portaudio.h" // NOLINT namespace sherpa_onnx { class Microphone { - PaStream *stream = nullptr; public: Microphone(); ~Microphone(); - int GetDeviceCount() const; - int GetDefaultInputDevice() const; - void PrintDevices(int sel) const; - - bool OpenDevice(int index, int sample_rate, int channel, PaStreamCallback cb, void* userdata); + int32_t GetDeviceCount() const; + int32_t GetDefaultInputDevice() const; + void PrintDevices(int32_t sel) const; + + bool OpenDevice(int32_t index, int32_t sample_rate, int32_t channel, + PaStreamCallback cb, void *userdata); + void CloseDevice(); + + private: + PaStream *stream = nullptr; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-source-separation-impl.cc b/sherpa-onnx/csrc/offline-source-separation-impl.cc index 74e11d76..3a68ab2a 100644 --- a/sherpa-onnx/csrc/offline-source-separation-impl.cc +++ b/sherpa-onnx/csrc/offline-source-separation-impl.cc @@ -4,7 +4,9 @@ #include "sherpa-onnx/csrc/offline-source-separation-impl.h" +#include #include +#include #if __ANDROID_API__ >= 9 #include 
"android/asset_manager.h" @@ -16,22 +18,93 @@ #endif #include "sherpa-onnx/csrc/offline-source-separation-spleeter-impl.h" +#include "sherpa-onnx/csrc/offline-source-separation-uvr-impl.h" +#include "sherpa-onnx/csrc/resample.h" namespace sherpa_onnx { std::unique_ptr OfflineSourceSeparationImpl::Create( const OfflineSourceSeparationConfig &config) { - // TODO(fangjun): Support other models - return std::make_unique(config); + if (!config.model.spleeter.vocals.empty()) { + return std::make_unique(config); + } + + if (!config.model.uvr.model.empty()) { + return std::make_unique(config); + } + + SHERPA_ONNX_LOGE("Please provide a separation model!"); + + return nullptr; } template std::unique_ptr OfflineSourceSeparationImpl::Create( Manager *mgr, const OfflineSourceSeparationConfig &config) { - // TODO(fangjun): Support other models - return std::make_unique(mgr, config); + if (!config.model.spleeter.vocals.empty()) { + return std::make_unique(mgr, config); + } + + if (!config.model.uvr.model.empty()) { + return std::make_unique(mgr, config); + } + + SHERPA_ONNX_LOGE("Please provide a separation model!"); + + return nullptr; +} + +OfflineSourceSeparationInput OfflineSourceSeparationImpl::Resample( + const OfflineSourceSeparationInput &input, bool debug /*= false*/) const { + const OfflineSourceSeparationInput *p_input = &input; + OfflineSourceSeparationInput tmp_input; + + int32_t output_sample_rate = GetOutputSampleRate(); + + if (input.sample_rate != output_sample_rate) { + SHERPA_ONNX_LOGE( + "Creating a resampler:\n" + " in_sample_rate: %d\n" + " output_sample_rate: %d\n", + input.sample_rate, output_sample_rate); + + float min_freq = std::min(input.sample_rate, output_sample_rate); + float lowpass_cutoff = 0.99 * 0.5 * min_freq; + + int32_t lowpass_filter_width = 6; + auto resampler = + std::make_unique(input.sample_rate, output_sample_rate, + lowpass_cutoff, lowpass_filter_width); + + std::vector s; + for (const auto &samples : input.samples.data) { + 
resampler->Reset(); + resampler->Resample(samples.data(), samples.size(), true, &s); + tmp_input.samples.data.push_back(std::move(s)); + } + + tmp_input.sample_rate = output_sample_rate; + p_input = &tmp_input; + } + + if (p_input->samples.data.size() > 1) { + if (debug) { + SHERPA_ONNX_LOGE("input ch1 samples size: %d", + static_cast(p_input->samples.data[1].size())); + } + + if (p_input->samples.data[0].size() != p_input->samples.data[1].size()) { + SHERPA_ONNX_LOGE("ch0 samples size %d vs ch1 samples size %d", + static_cast(p_input->samples.data[0].size()), + static_cast(p_input->samples.data[1].size())); + + SHERPA_ONNX_EXIT(-1); + } + } + + return *p_input; } #if __ANDROID_API__ >= 9 diff --git a/sherpa-onnx/csrc/offline-source-separation-impl.h b/sherpa-onnx/csrc/offline-source-separation-impl.h index 8bb6852d..bd1f26c1 100644 --- a/sherpa-onnx/csrc/offline-source-separation-impl.h +++ b/sherpa-onnx/csrc/offline-source-separation-impl.h @@ -5,6 +5,7 @@ #ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_IMPL_H_ #define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_IMPL_H_ +#include #include #include "sherpa-onnx/csrc/offline-source-separation.h" @@ -28,6 +29,9 @@ class OfflineSourceSeparationImpl { virtual int32_t GetOutputSampleRate() const = 0; virtual int32_t GetNumberOfStems() const = 0; + + OfflineSourceSeparationInput Resample( + const OfflineSourceSeparationInput &input, bool debug = false) const; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-source-separation-model-config.cc b/sherpa-onnx/csrc/offline-source-separation-model-config.cc index dfd765d3..00dcbb8e 100644 --- a/sherpa-onnx/csrc/offline-source-separation-model-config.cc +++ b/sherpa-onnx/csrc/offline-source-separation-model-config.cc @@ -4,10 +4,13 @@ #include "sherpa-onnx/csrc/offline-source-separation-model-config.h" +#include "sherpa-onnx/csrc/macros.h" + namespace sherpa_onnx { void OfflineSourceSeparationModelConfig::Register(ParseOptions *po) { spleeter.Register(po); 
+ uvr.Register(po); po->Register("num-threads", &num_threads, "Number of threads to run the neural network"); @@ -20,7 +23,17 @@ void OfflineSourceSeparationModelConfig::Register(ParseOptions *po) { } bool OfflineSourceSeparationModelConfig::Validate() const { - return spleeter.Validate(); + if (!spleeter.vocals.empty()) { + return spleeter.Validate(); + } + + if (!uvr.model.empty()) { + return uvr.Validate(); + } + + SHERPA_ONNX_LOGE("Please specify a source separation model"); + + return false; } std::string OfflineSourceSeparationModelConfig::ToString() const { @@ -28,6 +41,7 @@ std::string OfflineSourceSeparationModelConfig::ToString() const { os << "OfflineSourceSeparationModelConfig("; os << "spleeter=" << spleeter.ToString() << ", "; + os << "uvr=" << uvr.ToString() << ", "; os << "num_threads=" << num_threads << ", "; os << "debug=" << (debug ? "True" : "False") << ", "; os << "provider=\"" << provider << "\")"; diff --git a/sherpa-onnx/csrc/offline-source-separation-model-config.h b/sherpa-onnx/csrc/offline-source-separation-model-config.h index bf88d39d..7d07f703 100644 --- a/sherpa-onnx/csrc/offline-source-separation-model-config.h +++ b/sherpa-onnx/csrc/offline-source-separation-model-config.h @@ -8,12 +8,14 @@ #include #include "sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h" +#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h" #include "sherpa-onnx/csrc/parse-options.h" namespace sherpa_onnx { struct OfflineSourceSeparationModelConfig { OfflineSourceSeparationSpleeterModelConfig spleeter; + OfflineSourceSeparationUvrModelConfig uvr; int32_t num_threads = 1; bool debug = false; @@ -23,8 +25,10 @@ struct OfflineSourceSeparationModelConfig { OfflineSourceSeparationModelConfig( const OfflineSourceSeparationSpleeterModelConfig &spleeter, - int32_t num_threads, bool debug, const std::string &provider) + const OfflineSourceSeparationUvrModelConfig &uvr, int32_t num_threads, + bool debug, const std::string &provider) : 
spleeter(spleeter), + uvr(uvr), num_threads(num_threads), debug(debug), provider(provider) {} diff --git a/sherpa-onnx/csrc/offline-source-separation-spleeter-impl.h b/sherpa-onnx/csrc/offline-source-separation-spleeter-impl.h index 7a707c63..e48030eb 100644 --- a/sherpa-onnx/csrc/offline-source-separation-spleeter-impl.h +++ b/sherpa-onnx/csrc/offline-source-separation-spleeter-impl.h @@ -5,6 +5,10 @@ #ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_IMPL_H_ #define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_IMPL_H_ +#include +#include +#include + #include "Eigen/Dense" #include "kaldi-native-fbank/csrc/istft.h" #include "kaldi-native-fbank/csrc/stft.h" @@ -12,13 +16,12 @@ #include "sherpa-onnx/csrc/offline-source-separation-spleeter-model.h" #include "sherpa-onnx/csrc/offline-source-separation.h" #include "sherpa-onnx/csrc/onnx-utils.h" -#include "sherpa-onnx/csrc/resample.h" namespace sherpa_onnx { class OfflineSourceSeparationSpleeterImpl : public OfflineSourceSeparationImpl { public: - OfflineSourceSeparationSpleeterImpl( + explicit OfflineSourceSeparationSpleeterImpl( const OfflineSourceSeparationConfig &config) : config_(config), model_(config_.model) {} @@ -28,56 +31,12 @@ class OfflineSourceSeparationSpleeterImpl : public OfflineSourceSeparationImpl { : config_(config), model_(mgr, config_.model) {} OfflineSourceSeparationOutput Process( - const OfflineSourceSeparationInput &input) const override { - const OfflineSourceSeparationInput *p_input = &input; - OfflineSourceSeparationInput tmp_input; + const OfflineSourceSeparationInput &_input) const override { + auto input = Resample(_input, config_.model.debug); - int32_t output_sample_rate = GetOutputSampleRate(); + auto stft_ch0 = ComputeStft(input, 0); - if (input.sample_rate != output_sample_rate) { - SHERPA_ONNX_LOGE( - "Creating a resampler:\n" - " in_sample_rate: %d\n" - " output_sample_rate: %d\n", - input.sample_rate, output_sample_rate); - - float min_freq = 
std::min(input.sample_rate, output_sample_rate); - float lowpass_cutoff = 0.99 * 0.5 * min_freq; - - int32_t lowpass_filter_width = 6; - auto resampler = std::make_unique( - input.sample_rate, output_sample_rate, lowpass_cutoff, - lowpass_filter_width); - - std::vector s; - for (const auto &samples : input.samples.data) { - resampler->Reset(); - resampler->Resample(samples.data(), samples.size(), true, &s); - tmp_input.samples.data.push_back(std::move(s)); - } - - tmp_input.sample_rate = output_sample_rate; - p_input = &tmp_input; - } - - if (p_input->samples.data.size() > 1) { - if (config_.model.debug) { - SHERPA_ONNX_LOGE("input ch1 samples size: %d", - static_cast(p_input->samples.data[1].size())); - } - - if (p_input->samples.data[0].size() != p_input->samples.data[1].size()) { - SHERPA_ONNX_LOGE("ch0 samples size %d vs ch1 samples size %d", - static_cast(p_input->samples.data[0].size()), - static_cast(p_input->samples.data[1].size())); - - SHERPA_ONNX_EXIT(-1); - } - } - - auto stft_ch0 = ComputeStft(*p_input, 0); - - auto stft_ch1 = ComputeStft(*p_input, 1); + auto stft_ch1 = ComputeStft(input, 1); knf::StftResult *p_stft_ch1 = stft_ch1.real.empty() ? 
&stft_ch0 : &stft_ch1; int32_t num_frames = stft_ch0.num_frames; @@ -261,7 +220,6 @@ class OfflineSourceSeparationSpleeterImpl : public OfflineSourceSeparationImpl { stft_config.win_length = meta.window_length; stft_config.window_type = meta.window_type; stft_config.center = meta.center; - stft_config.center = false; return stft_config; } diff --git a/sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.cc b/sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.cc index c43f693f..0dc3ee6e 100644 --- a/sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.cc +++ b/sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.cc @@ -1,4 +1,4 @@ -// sherpa-onnx/csrc/offline-source-separation-spleeter_model-config.cc +// sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.cc // // Copyright (c) 2025 Xiaomi Corporation diff --git a/sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h b/sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h index 7e868966..18d4cd3e 100644 --- a/sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h +++ b/sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h @@ -1,4 +1,4 @@ -// sherpa-onnx/csrc/offline-source-separation-spleeter_model-config.h +// sherpa-onnx/csrc/offline-source-separation-spleeter-model-config.h // // Copyright (c) 2025 Xiaomi Corporation diff --git a/sherpa-onnx/csrc/offline-source-separation-spleeter-model-meta-data.h b/sherpa-onnx/csrc/offline-source-separation-spleeter-model-meta-data.h index 31b214cb..3db4c44d 100644 --- a/sherpa-onnx/csrc/offline-source-separation-spleeter-model-meta-data.h +++ b/sherpa-onnx/csrc/offline-source-separation-spleeter-model-meta-data.h @@ -1,6 +1,6 @@ // sherpa-onnx/csrc/offline-source-separation-spleeter-model-meta-data.h // -// Copyright (c) 2024 Xiaomi Corporation +// Copyright (c) 2025 Xiaomi Corporation #ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_MODEL_META_DATA_H_ 
#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_SPLEETER_MODEL_META_DATA_H_ diff --git a/sherpa-onnx/csrc/offline-source-separation-uvr-impl.h b/sherpa-onnx/csrc/offline-source-separation-uvr-impl.h new file mode 100644 index 00000000..a762cee3 --- /dev/null +++ b/sherpa-onnx/csrc/offline-source-separation-uvr-impl.h @@ -0,0 +1,382 @@ +// sherpa-onnx/csrc/offline-source-separation-uvr-impl.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_IMPL_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_IMPL_H_ + +#include +#include +#include + +#include "Eigen/Dense" +#include "kaldi-native-fbank/csrc/istft.h" +#include "kaldi-native-fbank/csrc/stft.h" +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/offline-source-separation-uvr-model.h" +#include "sherpa-onnx/csrc/offline-source-separation.h" +#include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/resample.h" + +namespace sherpa_onnx { + +class OfflineSourceSeparationUvrImpl : public OfflineSourceSeparationImpl { + public: + explicit OfflineSourceSeparationUvrImpl( + const OfflineSourceSeparationConfig &config) + : config_(config), model_(config_.model) {} + + template + OfflineSourceSeparationUvrImpl(Manager *mgr, + const OfflineSourceSeparationConfig &config) + : config_(config), model_(mgr, config_.model) {} + + OfflineSourceSeparationOutput Process( + const OfflineSourceSeparationInput &_input) const override { + auto input = Resample(_input, config_.model.debug); + + auto chunks_ch0 = SplitIntoChunks(input.samples.data[0]); + + std::vector> chunks_ch1; + if (input.samples.data.size() > 1) { + chunks_ch1 = SplitIntoChunks(input.samples.data[1]); + } + + std::vector samples_ch0; + std::vector samples_ch1; + + for (int32_t i = 0; i != static_cast(chunks_ch0.size()); ++i) { + bool is_first_chunk = (i == 0); + bool is_last_chunk = (i == static_cast(chunks_ch0.size()) - 1); + + auto s = ProcessChunk( + chunks_ch0[i], + 
chunks_ch1.empty() ? std::vector{} : chunks_ch1[i], + is_first_chunk, is_last_chunk); + + samples_ch0.insert(samples_ch0.end(), s.first.begin(), s.first.end()); + samples_ch1.insert(samples_ch1.end(), s.second.begin(), s.second.end()); + } + + auto &vocals_ch0 = samples_ch0; + auto &vocals_ch1 = samples_ch1; + + std::vector non_vocals_ch0(vocals_ch0.size()); + std::vector non_vocals_ch1(vocals_ch1.size()); + + Eigen::Map(non_vocals_ch0.data(), non_vocals_ch0.size()) = + Eigen::Map(input.samples.data[0].data(), + input.samples.data[0].size()) + .array() - + Eigen::Map(vocals_ch0.data(), vocals_ch0.size()) + .array(); + + if (input.samples.data.size() > 1) { + Eigen::Map(non_vocals_ch1.data(), + non_vocals_ch1.size()) = + Eigen::Map(input.samples.data[1].data(), + input.samples.data[1].size()) + .array() - + Eigen::Map(vocals_ch1.data(), vocals_ch1.size()) + .array(); + } else { + Eigen::Map(non_vocals_ch1.data(), + non_vocals_ch1.size()) = + Eigen::Map(input.samples.data[0].data(), + input.samples.data[0].size()) + .array() - + Eigen::Map(vocals_ch1.data(), vocals_ch1.size()) + .array(); + } + + OfflineSourceSeparationOutput ans; + ans.sample_rate = GetOutputSampleRate(); + + ans.stems.resize(2); + ans.stems[0].data.reserve(2); + ans.stems[1].data.reserve(2); + + ans.stems[0].data.push_back(std::move(vocals_ch0)); + ans.stems[0].data.push_back(std::move(vocals_ch1)); + + ans.stems[1].data.push_back(std::move(non_vocals_ch0)); + ans.stems[1].data.push_back(std::move(non_vocals_ch1)); + + return ans; + } + + int32_t GetOutputSampleRate() const override { + return model_.GetMetaData().sample_rate; + } + + int32_t GetNumberOfStems() const override { + return model_.GetMetaData().num_stems; + } + + private: + std::pair, std::vector> ProcessChunk( + const std::vector &chunk_ch0, const std::vector &chunk_ch1, + bool is_first_chunk, bool is_last_chunk) const { + int32_t pad0 = 0; + + auto stft_results_ch0 = ComputeStft(chunk_ch0, &pad0); + + int32_t pad1 = pad0; + 
std::vector stft_results_ch1; + + if (!chunk_ch1.empty()) { + stft_results_ch1 = ComputeStft(chunk_ch1, &pad1); + } else { + stft_results_ch1 = stft_results_ch0; + } + + const auto &meta_ = model_.GetMetaData(); + + int32_t num_frames = stft_results_ch0[0].num_frames; + int32_t dim_f = meta_.dim_f; + int32_t dim_t = meta_.dim_t; + int32_t n_fft_bin = meta_.n_fft / 2 + 1; + if (num_frames != dim_t) { + SHERPA_ONNX_LOGE("num_frames(%d) != dim_t(%d)", num_frames, dim_t); + SHERPA_ONNX_EXIT(-1); + } + + // the first 2: number of channels + // the second 2: real and image + std::vector x(stft_results_ch0.size() * 2 * 2 * dim_f * dim_t); + float *px = x.data(); + + for (int32_t i = 0; i != static_cast(stft_results_ch0.size()); + ++i) { + const auto &ch0 = stft_results_ch0[i]; + const auto &ch1 = stft_results_ch1[i]; + + const float *p_real_ch0 = ch0.real.data(); + const float *p_imag_ch0 = ch0.imag.data(); + + const float *p_real_ch1 = ch1.real.data(); + const float *p_imag_ch1 = ch1.imag.data(); + + for (int32_t j = 0; j != dim_f; ++j) { + for (int32_t k = 0; k != num_frames; ++k) { + *px = p_real_ch0[k * n_fft_bin + j]; + ++px; + } + } + + for (int32_t j = 0; j != dim_f; ++j) { + for (int32_t k = 0; k != num_frames; ++k) { + *px = p_imag_ch0[k * n_fft_bin + j]; + ++px; + } + } + + for (int32_t j = 0; j != dim_f; ++j) { + for (int32_t k = 0; k != num_frames; ++k) { + *px = p_real_ch1[k * n_fft_bin + j]; + ++px; + } + } + + for (int32_t j = 0; j != dim_f; ++j) { + for (int32_t k = 0; k != num_frames; ++k) { + *px = p_imag_ch1[k * n_fft_bin + j]; + ++px; + } + } + } // for (int32_t i = 0; i != + + auto memory_info = + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); + + std::array x_shape{ + static_cast(stft_results_ch0.size()) * 4 / meta_.dim_c, + meta_.dim_c, dim_f, dim_t}; + + Ort::Value x_tensor = Ort::Value::CreateTensor( + memory_info, x.data(), x.size(), x_shape.data(), x_shape.size()); + + Ort::Value spec = model_.Run(std::move(x_tensor)); + + 
const float *p_spec = spec.GetTensorData(); + + for (int32_t i = 0; i != static_cast(stft_results_ch0.size()); + ++i) { + auto &ch0 = stft_results_ch0[i]; + auto &ch1 = stft_results_ch1[i]; + + float *p_real_ch0 = ch0.real.data(); + float *p_imag_ch0 = ch0.imag.data(); + + float *p_real_ch1 = ch1.real.data(); + float *p_imag_ch1 = ch1.imag.data(); + + for (int32_t j = 0; j != dim_f; ++j) { + for (int32_t k = 0; k != num_frames; ++k) { + p_real_ch0[k * n_fft_bin + j] = *p_spec; + ++p_spec; + } + } + + for (int32_t j = 0; j != dim_f; ++j) { + for (int32_t k = 0; k != num_frames; ++k) { + p_imag_ch0[k * n_fft_bin + j] = *p_spec; + ++p_spec; + } + } + + for (int32_t j = 0; j != dim_f; ++j) { + for (int32_t k = 0; k != num_frames; ++k) { + p_real_ch1[k * n_fft_bin + j] = *p_spec; + ++p_spec; + } + } + + for (int32_t j = 0; j != dim_f; ++j) { + for (int32_t k = 0; k != num_frames; ++k) { + p_imag_ch1[k * n_fft_bin + j] = *p_spec; + ++p_spec; + } + } + + for (int32_t k = 0; k != num_frames; ++k) { + for (int32_t j = dim_f; j != n_fft_bin; ++j) { + p_real_ch0[k * n_fft_bin + j] = 0; + p_real_ch1[k * n_fft_bin + j] = 0; + + p_imag_ch0[k * n_fft_bin + j] = 0; + p_imag_ch1[k * n_fft_bin + j] = 0; + } + } + } + + auto samples_ch0 = ComputeInverseStft(stft_results_ch0, pad0, + is_first_chunk, is_last_chunk); + + auto samples_ch1 = ComputeInverseStft(stft_results_ch1, pad1, + is_first_chunk, is_last_chunk); + + return {std::move(samples_ch0), std::move(samples_ch1)}; + } + + std::vector ComputeInverseStft( + const std::vector &stft_result, int32_t pad, + bool is_first_chunk, bool is_last_chunk) const { + const auto &meta_ = model_.GetMetaData(); + int32_t trim = meta_.n_fft / 2; + + int32_t margin = meta_.margin; + + int32_t chunk_size = meta_.num_chunks * meta_.sample_rate; + + if (margin > chunk_size) { + margin = chunk_size; + } + + auto stft_config = GetStftConfig(); + knf::IStft istft(stft_config); + + std::vector ans; + + for (int32_t i = 0; i != 
static_cast(stft_result.size()); ++i) { + auto samples = istft.Compute(stft_result[i]); + int32_t num_samples = static_cast(samples.size()); + + ans.insert(ans.end(), samples.begin() + trim, + samples.begin() + (num_samples - trim)); + } + + int32_t start = is_first_chunk ? 0 : margin; + int32_t end = + is_last_chunk ? (ans.size() - pad) : (ans.size() - pad - margin); + + return {ans.begin() + start, ans.begin() + end}; + } + + std::vector ComputeStft(const std::vector &chunk, + int32_t *pad) const { + const auto &meta_ = model_.GetMetaData(); + + int32_t num_samples = static_cast(chunk.size()); + int32_t trim = meta_.n_fft / 2; + int32_t chunk_size = meta_.hop_length * (meta_.dim_t - 1); + int32_t gen_size = chunk_size - 2 * trim; + *pad = gen_size - num_samples % gen_size; + + std::vector samples(trim + chunk.size() + *pad + trim); + std::copy(chunk.begin(), chunk.end(), samples.begin() + trim); + + auto stft_config = GetStftConfig(); + knf::Stft stft(stft_config); + + std::vector stft_results; + // split the chunk into short segments + for (int32_t i = 0; i < num_samples + *pad; i += gen_size) { + auto r = stft.Compute(samples.data() + i, chunk_size); + stft_results.push_back(std::move(r)); + } + + return stft_results; + } + + std::vector> SplitIntoChunks( + const std::vector &samples) const { + std::vector> ans; + + if (samples.empty()) { + return ans; + } + + const auto &meta_ = model_.GetMetaData(); + int32_t margin = meta_.margin; + + int32_t chunk_size = meta_.num_chunks * meta_.sample_rate; + + if (static_cast(samples.size()) < chunk_size) { + chunk_size = samples.size(); + } + + if (margin > chunk_size) { + margin = chunk_size; + } + + for (int32_t i = 0; i < static_cast(samples.size()); + i += chunk_size) { + int32_t start = std::max(0, i - margin); + int32_t end = std::min(i + chunk_size + margin, + static_cast(samples.size())); + if (start >= end) { + break; + } + + ans.emplace_back(samples.begin() + start, samples.begin() + end); + + if (end == 
static_cast(samples.size())) { + break; + } + } + + return ans; + } + + knf::StftConfig GetStftConfig() const { + const auto &meta = model_.GetMetaData(); + + knf::StftConfig stft_config; + stft_config.n_fft = meta.n_fft; + stft_config.hop_length = meta.hop_length; + stft_config.win_length = meta.window_length; + stft_config.window_type = meta.window_type; + stft_config.center = meta.center; + + return stft_config; + } + + private: + OfflineSourceSeparationConfig config_; + OfflineSourceSeparationUvrModel model_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_IMPL_H_ diff --git a/sherpa-onnx/csrc/offline-source-separation-uvr-model-config.cc b/sherpa-onnx/csrc/offline-source-separation-uvr-model-config.cc new file mode 100644 index 00000000..f95ea307 --- /dev/null +++ b/sherpa-onnx/csrc/offline-source-separation-uvr-model-config.cc @@ -0,0 +1,39 @@ +// sherpa-onnx/csrc/offline-source-separation-uvr-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h" + +#include "sherpa-onnx/csrc/file-utils.h" +#include "sherpa-onnx/csrc/macros.h" + +namespace sherpa_onnx { + +void OfflineSourceSeparationUvrModelConfig::Register(ParseOptions *po) { + po->Register("uvr-model", &model, "Path to the UVR model"); +} + +bool OfflineSourceSeparationUvrModelConfig::Validate() const { + if (model.empty()) { + SHERPA_ONNX_LOGE("Please provide --uvr-model"); + return false; + } + + if (!FileExists(model)) { + SHERPA_ONNX_LOGE("UVR model '%s' does not exist. 
", model.c_str()); + return false; + } + + return true; +} + +std::string OfflineSourceSeparationUvrModelConfig::ToString() const { + std::ostringstream os; + + os << "OfflineSourceSeparationUvrModelConfig("; + os << "model=\"" << model << "\")"; + + return os.str(); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h b/sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h new file mode 100644 index 00000000..21cad76a --- /dev/null +++ b/sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h @@ -0,0 +1,32 @@ +// sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_ + +#include + +#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-config.h" +#include "sherpa-onnx/csrc/parse-options.h" + +namespace sherpa_onnx { + +struct OfflineSourceSeparationUvrModelConfig { + std::string model; + + OfflineSourceSeparationUvrModelConfig() = default; + + explicit OfflineSourceSeparationUvrModelConfig(const std::string &model) + : model(model) {} + + void Register(ParseOptions *po); + + bool Validate() const; + + std::string ToString() const; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_CONFIG_H_ diff --git a/sherpa-onnx/csrc/offline-source-separation-uvr-model-meta-data.h b/sherpa-onnx/csrc/offline-source-separation-uvr-model-meta-data.h new file mode 100644 index 00000000..79a6280d --- /dev/null +++ b/sherpa-onnx/csrc/offline-source-separation-uvr-model-meta-data.h @@ -0,0 +1,38 @@ +// sherpa-onnx/csrc/offline-source-separation-uvr-model-meta-data.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_META_DATA_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_META_DATA_H_ + 
+#include +#include +#include + +namespace sherpa_onnx { + +// See also +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/uvr_mdx/test.py +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/uvr_mdx/add_meta_data_and_quantize.py +struct OfflineSourceSeparationUvrModelMetaData { + int32_t sample_rate = 44100; + int32_t num_stems = 2; + int32_t dim_c = -1; + int32_t dim_f = -1; + int32_t dim_t = -1; + + int32_t n_fft = -1; + int32_t hop_length = 1024; + + int32_t window_length = -1; + int32_t center = 1; + std::string window_type = "hann"; + + // the following fields are preconfigured. Please see + // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/uvr_mdx/test.py + int32_t margin = 0; // changed in ./offline-source-separation-uvr-model.cc + const int32_t num_chunks = 15; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_META_DATA_H_ diff --git a/sherpa-onnx/csrc/offline-source-separation-uvr-model.cc b/sherpa-onnx/csrc/offline-source-separation-uvr-model.cc new file mode 100644 index 00000000..c69261fd --- /dev/null +++ b/sherpa-onnx/csrc/offline-source-separation-uvr-model.cc @@ -0,0 +1,172 @@ +// sherpa-onnx/csrc/offline-source-separation-uvr-model.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-source-separation-uvr-model.h" + +#include +#include +#include +#include + +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include "rawfile/raw_file_manager.h" +#endif + +#include "sherpa-onnx/csrc/file-utils.h" +#include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/session.h" +#include "sherpa-onnx/csrc/text-utils.h" + +namespace sherpa_onnx { + +class OfflineSourceSeparationUvrModel::Impl { + public: + explicit Impl(const OfflineSourceSeparationModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + 
allocator_{} { + auto buf = ReadFile(config.uvr.model); + Init(buf.data(), buf.size()); + } + + template + Impl(Manager *mgr, const OfflineSourceSeparationModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto buf = ReadFile(mgr, config.uvr.model); + Init(buf.data(), buf.size()); + } + + const OfflineSourceSeparationUvrModelMetaData &GetMetaData() const { + return meta_; + } + + Ort::Value Run(Ort::Value x) const { + auto out = sess_->Run({}, input_names_ptr_.data(), &x, 1, + output_names_ptr_.data(), output_names_ptr_.size()); + return std::move(out[0]); + } + + private: + void Init(void *model_data, size_t model_data_length) { + sess_ = std::make_unique(env_, model_data, model_data_length, + sess_opts_); + + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); + + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); + + Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); + if (config_.debug) { + std::ostringstream os; + os << "---UVR model---\n"; + PrintModelMetadata(os, meta_data); + + os << "----------input names----------\n"; + int32_t i = 0; + for (const auto &s : input_names_) { + os << i << " " << s << "\n"; + ++i; + } + os << "----------output names----------\n"; + i = 0; + for (const auto &s : output_names_) { + os << i << " " << s << "\n"; + ++i; + } + +#if __OHOS__ + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); +#else + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); +#endif + } + + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below + + std::string model_type; + SHERPA_ONNX_READ_META_DATA_STR(model_type, "model_type"); + if (model_type != "UVR") { + SHERPA_ONNX_LOGE("Expect model type 'UVR'. Given: '%s'", + model_type.c_str()); + SHERPA_ONNX_EXIT(-1); + } + + SHERPA_ONNX_READ_META_DATA(meta_.num_stems, "stems"); + if (meta_.num_stems != 2) { + SHERPA_ONNX_LOGE("Only 2stems is supported. 
Given %d stems", + meta_.num_stems); + SHERPA_ONNX_EXIT(-1); + } + + SHERPA_ONNX_READ_META_DATA(meta_.sample_rate, "sample_rate"); + SHERPA_ONNX_READ_META_DATA(meta_.n_fft, "n_fft"); + SHERPA_ONNX_READ_META_DATA(meta_.center, "center"); + SHERPA_ONNX_READ_META_DATA(meta_.window_length, "win_length"); + SHERPA_ONNX_READ_META_DATA(meta_.hop_length, "hop_length"); + SHERPA_ONNX_READ_META_DATA(meta_.dim_t, "dim_t"); + SHERPA_ONNX_READ_META_DATA(meta_.dim_f, "dim_f"); + SHERPA_ONNX_READ_META_DATA(meta_.dim_c, "dim_c"); + SHERPA_ONNX_READ_META_DATA_STR(meta_.window_type, "window_type"); + + meta_.margin = meta_.sample_rate; + } + + private: + OfflineSourceSeparationModelConfig config_; + OfflineSourceSeparationUvrModelMetaData meta_; + + Ort::Env env_; + Ort::SessionOptions sess_opts_; + Ort::AllocatorWithDefaultOptions allocator_; + + std::unique_ptr sess_; + + std::vector input_names_; + std::vector input_names_ptr_; + + std::vector output_names_; + std::vector output_names_ptr_; +}; + +OfflineSourceSeparationUvrModel::~OfflineSourceSeparationUvrModel() = default; + +OfflineSourceSeparationUvrModel::OfflineSourceSeparationUvrModel( + const OfflineSourceSeparationModelConfig &config) + : impl_(std::make_unique(config)) {} + +template +OfflineSourceSeparationUvrModel::OfflineSourceSeparationUvrModel( + Manager *mgr, const OfflineSourceSeparationModelConfig &config) + : impl_(std::make_unique(mgr, config)) {} + +Ort::Value OfflineSourceSeparationUvrModel::Run(Ort::Value x) const { + return impl_->Run(std::move(x)); +} + +const OfflineSourceSeparationUvrModelMetaData & +OfflineSourceSeparationUvrModel::GetMetaData() const { + return impl_->GetMetaData(); +} + +#if __ANDROID_API__ >= 9 +template OfflineSourceSeparationUvrModel::OfflineSourceSeparationUvrModel( + AAssetManager *mgr, const OfflineSourceSeparationModelConfig &config); +#endif + +#if __OHOS__ +template OfflineSourceSeparationUvrModel::OfflineSourceSeparationUvrModel( + NativeResourceManager *mgr, + const 
OfflineSourceSeparationModelConfig &config); +#endif + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-source-separation-uvr-model.h b/sherpa-onnx/csrc/offline-source-separation-uvr-model.h new file mode 100644 index 00000000..8ecf966a --- /dev/null +++ b/sherpa-onnx/csrc/offline-source-separation-uvr-model.h @@ -0,0 +1,36 @@ +// sherpa-onnx/csrc/offline-source-separation-uvr-model.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_H_ +#include + +#include "onnxruntime_cxx_api.h" // NOLINT +#include "sherpa-onnx/csrc/offline-source-separation-model-config.h" +#include "sherpa-onnx/csrc/offline-source-separation-uvr-model-meta-data.h" + +namespace sherpa_onnx { + +class OfflineSourceSeparationUvrModel { + public: + ~OfflineSourceSeparationUvrModel(); + + explicit OfflineSourceSeparationUvrModel( + const OfflineSourceSeparationModelConfig &config); + + template + OfflineSourceSeparationUvrModel( + Manager *mgr, const OfflineSourceSeparationModelConfig &config); + + Ort::Value Run(Ort::Value x) const; + + const OfflineSourceSeparationUvrModelMetaData &GetMetaData() const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_SOURCE_SEPARATION_UVR_MODEL_H_ diff --git a/sherpa-onnx/csrc/offline-source-separation.h b/sherpa-onnx/csrc/offline-source-separation.h index dc9e82a5..6d1e2ff2 100644 --- a/sherpa-onnx/csrc/offline-source-separation.h +++ b/sherpa-onnx/csrc/offline-source-separation.h @@ -19,7 +19,8 @@ struct OfflineSourceSeparationConfig { OfflineSourceSeparationConfig() = default; - OfflineSourceSeparationConfig(const OfflineSourceSeparationModelConfig &model) + explicit OfflineSourceSeparationConfig( + const OfflineSourceSeparationModelConfig &model) : model(model) {} void Register(ParseOptions *po); @@ -54,7 +55,7 @@ class OfflineSourceSeparation { 
public: ~OfflineSourceSeparation(); - OfflineSourceSeparation(const OfflineSourceSeparationConfig &config); + explicit OfflineSourceSeparation(const OfflineSourceSeparationConfig &config); template OfflineSourceSeparation(Manager *mgr, diff --git a/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-microphone.cc b/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-microphone.cc index 4eed37c4..dcb8e8ab 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-microphone.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-microphone.cc @@ -101,8 +101,8 @@ for a list of pre-trained models to download. mic_sample_rate = atof(pSampleRateStr); } - if(!mic.OpenDevice(device_index, mic_sample_rate, 1, - RecordCallback, s.get())) { + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + s.get())) { fprintf(stderr, "portaudio error: %d\n", device_index); exit(EXIT_FAILURE); } diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc index 3dc1153b..b94ba09a 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc @@ -142,8 +142,8 @@ for more models. mic_sample_rate = atof(pSampleRateStr); } - if (!mic.OpenDevice(device_index, mic_sample_rate, 1, - RecordCallback, nullptr /* user_data */)){ + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + nullptr /* user_data */)) { fprintf(stderr, "portaudio error: %d\n", device_index); exit(EXIT_FAILURE); } diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-speaker-identification.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-speaker-identification.cc index 0d5c6322..3d22f0d0 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-speaker-identification.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-speaker-identification.cc @@ -244,8 +244,8 @@ Note that `zh` means Chinese, while `en` means English. 
mic_sample_rate = atof(pSampleRateStr); } - if (!mic.OpenDevice(device_index, mic_sample_rate, 1, - RecordCallback, nullptr /* user_data */)){ + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + nullptr /* user_data */)) { fprintf(stderr, "portaudio error: %d\n", device_index); exit(EXIT_FAILURE); } diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc index 4d00b9ba..5a012a8b 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc @@ -159,8 +159,8 @@ for a list of pre-trained models to download. mic_sample_rate = atof(pSampleRateStr); } - if (!mic.OpenDevice(device_index, mic_sample_rate, 1, - RecordCallback, nullptr /* user_data */)){ + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + nullptr /* user_data */)) { fprintf(stderr, "portaudio error: %d\n", device_index); exit(EXIT_FAILURE); } diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone.cc index c5b13bf7..a3c1294a 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-microphone.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-microphone.cc @@ -129,8 +129,8 @@ for a list of pre-trained models to download. 
mic_sample_rate = atof(pSampleRateStr); } - if (!mic.OpenDevice(device_index, mic_sample_rate, 1, - RecordCallback, nullptr /* user_data */)){ + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + nullptr /* user_data */)) { fprintf(stderr, "portaudio error: %d\n", device_index); exit(EXIT_FAILURE); } diff --git a/sherpa-onnx/csrc/sherpa-onnx-offline-source-separation.cc b/sherpa-onnx/csrc/sherpa-onnx-offline-source-separation.cc index 8af94aa1..86e9d6dc 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-offline-source-separation.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-offline-source-separation.cc @@ -33,6 +33,17 @@ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-m --input-wav=audio_example.wav \ --output-vocals-wav=output_vocals.wav \ --output-accompaniment-wav=output_accompaniment.wav + +(2) Use UVR models + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/UVR_MDXNET_1_9703.onnx +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/audio_example.wav + +./bin/sherpa-onnx-offline-source-separation \ + --uvr-model=./UVR_MDXNET_1_9703.onnx \ + --input-wav=audio_example.wav \ + --output-vocals-wav=output_vocals.wav \ + --output-accompaniment-wav=output_accompaniment.wav )usage"; sherpa_onnx::ParseOptions po(kUsageMessage); diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc index 2ae17cbc..b3a4b355 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc @@ -136,7 +136,8 @@ to download models for offline ASR. 
mic_sample_rate = atof(pSampleRateStr); } - if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, nullptr)) { + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + nullptr)) { fprintf(stderr, "Failed to open device %d\n", device_index); exit(EXIT_FAILURE); } diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc index e6360063..31d98c0d 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc @@ -74,7 +74,6 @@ wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/siler sherpa_onnx::Microphone mic; - int32_t device_index = Pa_GetDefaultInputDevice(); if (device_index == paNoDevice) { fprintf(stderr, "No default input device found\n"); @@ -96,7 +95,8 @@ wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/siler fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); mic_sample_rate = atof(pSampleRateStr); } - if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, nullptr)) { + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + nullptr)) { fprintf(stderr, "Failed to open microphone device %d\n", device_index); exit(EXIT_FAILURE); } diff --git a/sherpa-onnx/jni/offline-tts.cc b/sherpa-onnx/jni/offline-tts.cc index 4a11d064..be0a2634 100644 --- a/sherpa-onnx/jni/offline-tts.cc +++ b/sherpa-onnx/jni/offline-tts.cc @@ -5,6 +5,7 @@ #include "sherpa-onnx/csrc/offline-tts.h" #include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/text-utils.h" #include "sherpa-onnx/csrc/wave-writer.h" #include "sherpa-onnx/jni/common.h" @@ -207,7 +208,10 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_newFromAsset( } #endif auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config); - SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + auto str_vec = sherpa_onnx::SplitString(config.ToString(), 128); + for (const auto &s : 
str_vec) { + SHERPA_ONNX_LOGE("%s", s.c_str()); + } auto tts = new sherpa_onnx::OfflineTts( #if __ANDROID_API__ >= 9