Play generated audio using alsa for TTS (#482)
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
|
||||
project(sherpa-onnx)
|
||||
|
||||
set(SHERPA_ONNX_VERSION "1.9.3")
|
||||
set(SHERPA_ONNX_VERSION "1.9.4")
|
||||
|
||||
# Disable warning about
|
||||
#
|
||||
@@ -106,10 +106,23 @@ endif()
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
|
||||
|
||||
|
||||
include(CheckIncludeFileCXX)
|
||||
check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
|
||||
if(SHERPA_ONNX_HAS_ALSA)
|
||||
add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
|
||||
|
||||
# Probe for the ALSA development header on non-Apple UNIX hosts only.
# When it is found, SHERPA_ONNX_ENABLE_ALSA=1 is defined for the C++ sources;
# otherwise a warning explains which executables are skipped and how to
# install the missing package.
if(UNIX AND NOT APPLE)
  check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
  if(SHERPA_ONNX_HAS_ALSA)
    add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
  else()
    message(WARNING "\
Could not find alsa/asoundlib.h !
We won't build sherpa-onnx-alsa and sherpa-onnx-offline-tts-play-alsa
To fix that, please do:
(1) sudo apt-get install alsa-utils libasound2-dev
(2) rm -rf build
(3) re-try
")
  endif()
endif()
|
||||
|
||||
check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H)
|
||||
|
||||
@@ -144,6 +144,8 @@ class BuildExtension(build_ext):
|
||||
binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
|
||||
binaries += ["sherpa-onnx-offline-tts"]
|
||||
binaries += ["sherpa-onnx-offline-tts-play"]
|
||||
binaries += ["sherpa-onnx-alsa"]
|
||||
binaries += ["sherpa-onnx-offline-tts-play-alsa"]
|
||||
|
||||
if is_windows():
|
||||
binaries += ["kaldi-native-fbank-core.dll"]
|
||||
@@ -165,6 +167,11 @@ class BuildExtension(build_ext):
|
||||
src_file = install_dir / "lib" / (f + suffix)
|
||||
if not src_file.is_file():
|
||||
src_file = install_dir / ".." / (f + suffix)
|
||||
|
||||
if not src_file.is_file() and 'alsa' in f:
|
||||
print(f'Skipping {f}')
|
||||
continue
|
||||
|
||||
print(f"Copying {src_file} to {out_bin_dir}/")
|
||||
shutil.copy(f"{src_file}", f"{out_bin_dir}/")
|
||||
|
||||
|
||||
2
setup.py
2
setup.py
@@ -60,6 +60,8 @@ def get_binaries_to_install():
|
||||
binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
|
||||
binaries += ["sherpa-onnx-offline-tts"]
|
||||
binaries += ["sherpa-onnx-offline-tts-play"]
|
||||
binaries += ["sherpa-onnx-alsa"]
|
||||
binaries += ["sherpa-onnx-offline-tts-play-alsa"]
|
||||
if is_windows():
|
||||
binaries += ["kaldi-native-fbank-core.dll"]
|
||||
binaries += ["sherpa-onnx-c-api.dll"]
|
||||
|
||||
@@ -207,14 +207,42 @@ install(
|
||||
|
||||
if(SHERPA_ONNX_HAS_ALSA)
|
||||
add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
|
||||
target_link_libraries(sherpa-onnx-alsa sherpa-onnx-core)
|
||||
add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
|
||||
|
||||
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
|
||||
target_link_libraries(sherpa-onnx-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
|
||||
else()
|
||||
target_link_libraries(sherpa-onnx-alsa asound)
|
||||
set(exes
|
||||
sherpa-onnx-alsa
|
||||
sherpa-onnx-offline-tts-play-alsa
|
||||
)
|
||||
foreach(exe IN LISTS exes)
|
||||
target_link_libraries(${exe} sherpa-onnx-core)
|
||||
endforeach()
|
||||
|
||||
foreach(exe IN LISTS exes)
|
||||
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
|
||||
target_link_libraries(${exe} -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
|
||||
else()
|
||||
target_link_libraries(${exe} asound)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
if(NOT WIN32)
|
||||
foreach(exe IN LISTS exes)
|
||||
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
|
||||
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
|
||||
endforeach()
|
||||
|
||||
if(SHERPA_ONNX_ENABLE_PYTHON)
|
||||
foreach(exe IN LISTS exes)
|
||||
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
|
||||
endforeach()
|
||||
endif()
|
||||
endif()
|
||||
install(TARGETS sherpa-onnx-alsa DESTINATION bin)
|
||||
|
||||
install(
|
||||
TARGETS ${exes}
|
||||
DESTINATION
|
||||
bin
|
||||
)
|
||||
endif()
|
||||
|
||||
if(SHERPA_ONNX_ENABLE_PORTAUDIO)
|
||||
|
||||
150
sherpa-onnx/csrc/alsa-play.cc
Normal file
150
sherpa-onnx/csrc/alsa-play.cc
Normal file
@@ -0,0 +1,150 @@
|
||||
// sherpa-onnx/csrc/alsa-play.cc
|
||||
//
|
||||
// Copyright (c) 2022-2023 Xiaomi Corporation
|
||||
|
||||
#ifdef SHERPA_ONNX_ENABLE_ALSA
|
||||
|
||||
#include "sherpa-onnx/csrc/alsa-play.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Opens `device_name` as a playback PCM stream and then applies the hardware
// parameters for `sample_rate` via SetParameters().
//
// The process is terminated if the device cannot be opened.
AlsaPlay::AlsaPlay(const char *device_name, int32_t sample_rate) {
  const int32_t status =
      snd_pcm_open(&handle_, device_name, SND_PCM_STREAM_PLAYBACK, 0);

  if (status != 0) {
    fprintf(stderr, "Unable to open: %s. %s\n", device_name,
            snd_strerror(status));
    exit(-1);
  }

  SetParameters(sample_rate);
}
|
||||
|
||||
// Closes the PCM handle if one is held. A close failure is reported on
// stdout and otherwise ignored.
AlsaPlay::~AlsaPlay() {
  if (!handle_) {
    return;
  }

  const int32_t status = snd_pcm_close(handle_);
  if (status < 0) {
    printf("Failed to close pcm: %s\n", snd_strerror(status));
  }
}
|
||||
|
||||
void AlsaPlay::SetParameters(int32_t sample_rate) {
|
||||
// set the following parameters
|
||||
// 1. sample_rate
|
||||
// 2. sample format: int16_t
|
||||
// 3. num_channels: 1
|
||||
snd_pcm_hw_params_t *params;
|
||||
snd_pcm_hw_params_alloca(¶ms);
|
||||
snd_pcm_hw_params_any(handle_, params);
|
||||
|
||||
int32_t err = snd_pcm_hw_params_set_access(handle_, params,
|
||||
SND_PCM_ACCESS_RW_INTERLEAVED);
|
||||
if (err < 0) {
|
||||
printf("SND_PCM_ACCESS_RW_INTERLEAVED is not supported: %s\n",
|
||||
snd_strerror(err));
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
err = snd_pcm_hw_params_set_format(handle_, params, SND_PCM_FORMAT_S16_LE);
|
||||
|
||||
if (err < 0) {
|
||||
printf("Can't set format to 16-bit: %s\n", snd_strerror(err));
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
err = snd_pcm_hw_params_set_channels(handle_, params, 1);
|
||||
|
||||
if (err < 0) {
|
||||
printf("Can't set channel number to 1: %s\n", snd_strerror(err));
|
||||
}
|
||||
|
||||
uint32_t rate = sample_rate;
|
||||
err = snd_pcm_hw_params_set_rate_near(handle_, params, &rate, 0);
|
||||
if (err < 0) {
|
||||
printf("Can't set rate to %d. %s\n", rate, snd_strerror(err));
|
||||
}
|
||||
|
||||
err = snd_pcm_hw_params(handle_, params);
|
||||
if (err < 0) {
|
||||
printf("Can't set hardware parameters. %s\n", snd_strerror(err));
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
uint32_t tmp;
|
||||
snd_pcm_hw_params_get_rate(params, &tmp, 0);
|
||||
int32_t actual_sample_rate = tmp;
|
||||
if (actual_sample_rate != sample_rate) {
|
||||
fprintf(stderr,
|
||||
"Creating a resampler:\n"
|
||||
" in_sample_rate: %d\n"
|
||||
" output_sample_rate: %d\n",
|
||||
sample_rate, actual_sample_rate);
|
||||
|
||||
float min_freq = std::min(actual_sample_rate, sample_rate);
|
||||
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
|
||||
|
||||
int32_t lowpass_filter_width = 6;
|
||||
resampler_ = std::make_unique<LinearResample>(
|
||||
sample_rate, actual_sample_rate, lowpass_cutoff, lowpass_filter_width);
|
||||
}
|
||||
|
||||
snd_pcm_uframes_t frames;
|
||||
snd_pcm_hw_params_get_period_size(params, &frames, 0);
|
||||
buf_.resize(frames);
|
||||
}
|
||||
|
||||
void AlsaPlay::Play(const std::vector<float> &samples) {
|
||||
std::vector<float> tmp;
|
||||
const float *p = samples.data();
|
||||
int32_t num_samples = samples.size();
|
||||
if (resampler_) {
|
||||
resampler_->Resample(samples.data(), samples.size(), false, &tmp);
|
||||
p = tmp.data();
|
||||
num_samples = tmp.size();
|
||||
}
|
||||
|
||||
int32_t frames = buf_.size();
|
||||
int32_t i = 0;
|
||||
for (; i + frames < num_samples; i += frames) {
|
||||
for (int32_t k = 0; k != frames; ++k) {
|
||||
buf_[k] = p[i + k] * 32767;
|
||||
}
|
||||
|
||||
int32_t err = snd_pcm_writei(handle_, buf_.data(), frames);
|
||||
if (err == -EPIPE) {
|
||||
printf("XRUN.\n");
|
||||
snd_pcm_prepare(handle_);
|
||||
} else if (err < 0) {
|
||||
printf("Can't write to PCM device: %s\n", snd_strerror(err));
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
if (i < num_samples) {
|
||||
for (int32_t k = 0; k + i < num_samples; ++k) {
|
||||
buf_[k] = p[i + k] * 32767;
|
||||
}
|
||||
|
||||
int32_t err = snd_pcm_writei(handle_, buf_.data(), num_samples - i);
|
||||
if (err == -EPIPE) {
|
||||
printf("XRUN.\n");
|
||||
snd_pcm_prepare(handle_);
|
||||
} else if (err < 0) {
|
||||
printf("Can't write to PCM device: %s\n", snd_strerror(err));
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void AlsaPlay::Drain() {
|
||||
int32_t err = snd_pcm_drain(handle_);
|
||||
if (err < 0) {
|
||||
printf("Failed to drain pcm. %s\n", snd_strerror(err));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_ENABLE_ALSA
|
||||
37
sherpa-onnx/csrc/alsa-play.h
Normal file
37
sherpa-onnx/csrc/alsa-play.h
Normal file
@@ -0,0 +1,37 @@
|
||||
// sherpa-onnx/csrc/alsa-play.h
|
||||
//
|
||||
// Copyright (c) 2022-2023 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_ALSA_PLAY_H_
|
||||
#define SHERPA_ONNX_CSRC_ALSA_PLAY_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "alsa/asoundlib.h"
|
||||
#include "sherpa-onnx/csrc/resample.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
class AlsaPlay {
|
||||
public:
|
||||
AlsaPlay(const char *device_name, int32_t sample_rate);
|
||||
~AlsaPlay();
|
||||
void Play(const std::vector<float> &samples);
|
||||
|
||||
// wait for all the samples to be played
|
||||
void Drain();
|
||||
|
||||
private:
|
||||
void SetParameters(int32_t sample_rate);
|
||||
|
||||
private:
|
||||
snd_pcm_t *handle_ = nullptr;
|
||||
std::unique_ptr<LinearResample> resampler_;
|
||||
std::vector<int16_t> buf_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_ALSA_PLAY_H_
|
||||
218
sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
Normal file
218
sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
Normal file
@@ -0,0 +1,218 @@
|
||||
// sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
|
||||
//
|
||||
// Copyright (c) 2022-2023 Xiaomi Corporation
|
||||
|
||||
// see https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
|
||||
// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m___h_w___params.html
|
||||
// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
|
||||
|
||||
#include <signal.h>

#include <algorithm>
#include <atomic>
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <cstdlib>
#include <fstream>
#include <mutex>  // NOLINT
#include <queue>
#include <thread>  // NOLINT
#include <vector>

#include "sherpa-onnx/csrc/alsa-play.h"
#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/csrc/parse-options.h"
#include "sherpa-onnx/csrc/wave-writer.h"
|
||||
|
||||
// Used to wake up the playback thread when new audio is queued or when the
// generation has finished.
static std::condition_variable g_cv;
static std::mutex g_cv_m;

// Queue of generated audio chunks handed from the generating thread to the
// playback thread.
struct Buffer {
  std::queue<std::vector<float>> samples;
  std::mutex mutex;  // locked by the producer while pushing to `samples`
};

static Buffer g_buffer;

// Both flags are written by one thread (or the SIGINT handler) and read by
// another, so they are atomics rather than plain bools.

// Set by main() once the generation has finished.
static std::atomic<bool> g_stopped{false};

// Set by the SIGINT handler on the first Ctrl + C.
static std::atomic<bool> g_killed{false};
|
||||
|
||||
// SIGINT handler.
//
// The first Ctrl + C only sets g_killed so that the rest of the program can
// wind down by itself. A second Ctrl + C terminates the process immediately.
//
// std::_Exit() is used instead of exit(): exit() is not async-signal-safe and
// would run static destructors while the other threads may still be using
// the globals.
static void Handler(int32_t /*sig*/) {
  if (g_killed) {
    std::_Exit(0);
  }

  g_killed = true;

  // NOTE(review): fprintf() is not async-signal-safe either; it is kept so
  // that the user still gets feedback on the first Ctrl + C.
  fprintf(stderr, "\nCaught Ctrl + C. Exiting\n");
}
|
||||
|
||||
static void AudioGeneratedCallback(const float *s, int32_t n) {
|
||||
if (n > 0) {
|
||||
std::lock_guard<std::mutex> lock(g_buffer.mutex);
|
||||
g_buffer.samples.push({s, s + n});
|
||||
g_cv.notify_all();
|
||||
}
|
||||
}
|
||||
|
||||
static void StartPlayback(const std::string &device_name, int32_t sample_rate) {
|
||||
sherpa_onnx::AlsaPlay alsa(device_name.c_str(), sample_rate);
|
||||
|
||||
std::unique_lock<std::mutex> lock(g_cv_m);
|
||||
while (!g_killed && !g_stopped) {
|
||||
while (!g_buffer.samples.empty()) {
|
||||
auto &p = g_buffer.samples.front();
|
||||
alsa.Play(p);
|
||||
g_buffer.samples.pop();
|
||||
}
|
||||
|
||||
g_cv.wait(lock);
|
||||
}
|
||||
|
||||
if (g_killed) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (g_stopped) {
|
||||
while (!g_buffer.samples.empty()) {
|
||||
auto &p = g_buffer.samples.front();
|
||||
alsa.Play(p);
|
||||
g_buffer.samples.pop();
|
||||
}
|
||||
}
|
||||
|
||||
alsa.Drain();
|
||||
}
|
||||
|
||||
int main(int32_t argc, char *argv[]) {
|
||||
signal(SIGINT, Handler);
|
||||
|
||||
const char *kUsageMessage = R"usage(
|
||||
Offline text-to-speech with sherpa-onnx.
|
||||
|
||||
It plays the generated audio as the model is processing.
|
||||
|
||||
Note that it is alsa so it works only on **Linux**. For instance, you can
|
||||
use it on Raspberry Pi.
|
||||
|
||||
Usage example:
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||
|
||||
./bin/sherpa-onnx-offline-tts-play-alsa \
|
||||
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
|
||||
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
|
||||
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
|
||||
--output-filename=./generated.wav \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
It will generate a file ./generated.wav as specified by --output-filename.
|
||||
|
||||
You can find more models at
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||
|
||||
Please see
|
||||
https://k2-fsa.github.io/sherpa/onnx/tts/index.html
|
||||
or details.
|
||||
)usage";
|
||||
|
||||
sherpa_onnx::ParseOptions po(kUsageMessage);
|
||||
std::string device_name = "default";
|
||||
std::string output_filename = "./generated.wav";
|
||||
int32_t sid = 0;
|
||||
|
||||
po.Register("output-filename", &output_filename,
|
||||
"Path to save the generated audio");
|
||||
|
||||
po.Register("device-name", &device_name,
|
||||
"Name of the device to play the generated audio");
|
||||
|
||||
po.Register("sid", &sid,
|
||||
"Speaker ID. Used only for multi-speaker models, e.g., models "
|
||||
"trained using the VCTK dataset. Not used for single-speaker "
|
||||
"models, e.g., models trained using the LJSpeech dataset");
|
||||
|
||||
sherpa_onnx::OfflineTtsConfig config;
|
||||
|
||||
config.Register(&po);
|
||||
po.Read(argc, argv);
|
||||
|
||||
if (po.NumArgs() == 0) {
|
||||
fprintf(stderr, "Error: Please provide the text to generate audio.\n\n");
|
||||
po.PrintUsage();
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (po.NumArgs() > 1) {
|
||||
fprintf(stderr,
|
||||
"Error: Accept only one positional argument. Please use single "
|
||||
"quotes to wrap your text\n");
|
||||
po.PrintUsage();
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (!config.Validate()) {
|
||||
fprintf(stderr, "Errors in config!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (config.max_num_sentences != 1) {
|
||||
fprintf(stderr, "Setting config.max_num_sentences to 1\n");
|
||||
config.max_num_sentences = 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "Loading the model\n");
|
||||
sherpa_onnx::OfflineTts tts(config);
|
||||
|
||||
fprintf(stderr, "Start the playback thread\n");
|
||||
std::thread playback_thread(StartPlayback, device_name, tts.SampleRate());
|
||||
|
||||
float speed = 1.0;
|
||||
|
||||
fprintf(stderr, "Generating ...\n");
|
||||
const auto begin = std::chrono::steady_clock::now();
|
||||
auto audio = tts.Generate(po.GetArg(1), sid, speed, AudioGeneratedCallback);
|
||||
const auto end = std::chrono::steady_clock::now();
|
||||
g_stopped = true;
|
||||
g_cv.notify_all();
|
||||
fprintf(stderr, "Generating done!\n");
|
||||
if (audio.samples.empty()) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"Error in generating audio. Please read previous error messages.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
float elapsed_seconds =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
|
||||
.count() /
|
||||
1000.;
|
||||
float duration = audio.samples.size() / static_cast<float>(audio.sample_rate);
|
||||
|
||||
float rtf = elapsed_seconds / duration;
|
||||
fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
|
||||
fprintf(stderr, "Audio duration: %.3f s\n", duration);
|
||||
fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds,
|
||||
duration, rtf);
|
||||
|
||||
bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate,
|
||||
audio.samples.data(), audio.samples.size());
|
||||
if (!ok) {
|
||||
fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str());
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
fprintf(stderr, "The text is: %s. Speaker ID: %d\n\n", po.GetArg(1).c_str(),
|
||||
sid);
|
||||
fprintf(stderr, "\n**** Saved to %s successfully! ****\n",
|
||||
output_filename.c_str());
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(
|
||||
stderr,
|
||||
"Wait for the playback to finish. You can safely press ctrl + C to stop "
|
||||
"the playback.\n");
|
||||
playback_thread.join();
|
||||
|
||||
fprintf(stderr, "Done!\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user