Play generated audio using alsa for TTS (#482)

2023-12-13 22:28:03 +08:00
parent 9829d7c4d3
commit b18812ceff
7 changed files with 465 additions and 10 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)

-set(SHERPA_ONNX_VERSION "1.9.3")
+set(SHERPA_ONNX_VERSION "1.9.4")

 # Disable warning about
 #
@@ -106,10 +106,23 @@ endif()
 set(CMAKE_CXX_EXTENSIONS OFF)
 message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")

+
 include(CheckIncludeFileCXX)
-check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
-if(SHERPA_ONNX_HAS_ALSA)
-  add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
+
+if(UNIX AND NOT APPLE)
+  # Probe for the ALSA development header. SHERPA_ONNX_HAS_ALSA gates the
+  # ALSA-based executables; SHERPA_ONNX_ENABLE_ALSA guards the ALSA sources.
+  check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
+  if(SHERPA_ONNX_HAS_ALSA)
+    add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
+  else()
+    message(WARNING "\
+Could not find alsa/asoundlib.h !
+We won't build sherpa-onnx-alsa and sherpa-onnx-offline-tts-play-alsa
+To fix that, please do:
+  (1) sudo apt-get install alsa-utils libasound2-dev
+  (2) rm -rf build
+  (3) re-try
+  ")
+  endif()
 endif()

 check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H)
--- a/cmake/cmake_extension.py
+++ b/cmake/cmake_extension.py
@@ -144,6 +144,8 @@ class BuildExtension(build_ext):
        binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
        binaries += ["sherpa-onnx-offline-tts"]
        binaries += ["sherpa-onnx-offline-tts-play"]
+        binaries += ["sherpa-onnx-alsa"]
+        binaries += ["sherpa-onnx-offline-tts-play-alsa"]

        if is_windows():
            binaries += ["kaldi-native-fbank-core.dll"]
@@ -165,6 +167,11 @@ class BuildExtension(build_ext):
                src_file = install_dir / "lib" / (f + suffix)
            if not src_file.is_file():
                src_file = install_dir / ".." / (f + suffix)
+
+            if not src_file.is_file() and 'alsa' in f:
+                print(f'Skipping {f}')
+                continue
+
            print(f"Copying {src_file} to {out_bin_dir}/")
            shutil.copy(f"{src_file}", f"{out_bin_dir}/")

--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,8 @@ def get_binaries_to_install():
    binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
    binaries += ["sherpa-onnx-offline-tts"]
    binaries += ["sherpa-onnx-offline-tts-play"]
+    binaries += ["sherpa-onnx-alsa"]
+    binaries += ["sherpa-onnx-offline-tts-play-alsa"]
    if is_windows():
        binaries += ["kaldi-native-fbank-core.dll"]
        binaries += ["sherpa-onnx-c-api.dll"]
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -207,14 +207,42 @@ install(

 if(SHERPA_ONNX_HAS_ALSA)
  add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
-  target_link_libraries(sherpa-onnx-alsa sherpa-onnx-core)
+  add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)

-  if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
-    target_link_libraries(sherpa-onnx-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
-  else()
-    target_link_libraries(sherpa-onnx-alsa asound)
+  set(exes
+    sherpa-onnx-alsa
+    sherpa-onnx-offline-tts-play-alsa
+  )
+  foreach(exe IN LISTS exes)
+    target_link_libraries(${exe} sherpa-onnx-core)
+  endforeach()
+
+  foreach(exe IN LISTS exes)
+    if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
+      target_link_libraries(${exe} -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
+    else()
+      target_link_libraries(${exe} asound)
+    endif()
+  endforeach()
+
+  if(NOT WIN32)
+    foreach(exe IN LISTS exes)
+      target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
+      target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
+    endforeach()
+
+    if(SHERPA_ONNX_ENABLE_PYTHON)
+      foreach(exe IN LISTS exes)
+        target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
+      endforeach()
+    endif()
  endif()
-  install(TARGETS sherpa-onnx-alsa DESTINATION bin)
+
+  install(
+    TARGETS ${exes}
+    DESTINATION
+      bin
+  )
 endif()

 if(SHERPA_ONNX_ENABLE_PORTAUDIO)
--- a/sherpa-onnx/csrc/alsa-play.cc
+++ b/sherpa-onnx/csrc/alsa-play.cc
@@ -0,0 +1,150 @@
+// sherpa-onnx/csrc/alsa-play.cc
+//
+// Copyright (c)  2022-2023  Xiaomi Corporation
+
+#ifdef SHERPA_ONNX_ENABLE_ALSA
+
+#include "sherpa-onnx/csrc/alsa-play.h"
+
+#include <algorithm>
+
+namespace sherpa_onnx {
+
+// Opens `device_name` as a playback PCM device and then configures it via
+// SetParameters(). Terminates the process if the device cannot be opened.
+AlsaPlay::AlsaPlay(const char *device_name, int32_t sample_rate) {
+  const int32_t status =
+      snd_pcm_open(&handle_, device_name, SND_PCM_STREAM_PLAYBACK, 0);
+
+  if (status != 0) {
+    fprintf(stderr, "Unable to open: %s. %s\n", device_name,
+            snd_strerror(status));
+    exit(-1);
+  }
+
+  SetParameters(sample_rate);
+}
+
+// Closes the PCM handle if one was opened. A failed close is only reported.
+AlsaPlay::~AlsaPlay() {
+  if (handle_ == nullptr) {
+    return;
+  }
+
+  const int32_t status = snd_pcm_close(handle_);
+  if (status < 0) {
+    printf("Failed to close pcm: %s\n", snd_strerror(status));
+  }
+}
+
+// Configures the opened PCM device for interleaved, signed 16-bit
+// little-endian, single-channel playback at a rate as close as possible to
+// `sample_rate`.
+//
+// If the device ends up running at a different rate, a resampler is created
+// so that Play() can convert its input. Failures that would make the frames
+// written by Play() disagree with the device configuration (access mode,
+// sample format, channel count) are fatal.
+void AlsaPlay::SetParameters(int32_t sample_rate) {
+  // Used when the device does not report a usable period size.
+  constexpr snd_pcm_uframes_t kFallbackFrames = 1024;
+
+  snd_pcm_hw_params_t *params;
+  snd_pcm_hw_params_alloca(&params);
+
+  int32_t err = snd_pcm_hw_params_any(handle_, params);
+  if (err < 0) {
+    fprintf(stderr, "Can't get the hardware parameter space: %s\n",
+            snd_strerror(err));
+    exit(-1);
+  }
+
+  err = snd_pcm_hw_params_set_access(handle_, params,
+                                     SND_PCM_ACCESS_RW_INTERLEAVED);
+  if (err < 0) {
+    fprintf(stderr, "SND_PCM_ACCESS_RW_INTERLEAVED is not supported: %s\n",
+            snd_strerror(err));
+    exit(-1);
+  }
+
+  err = snd_pcm_hw_params_set_format(handle_, params, SND_PCM_FORMAT_S16_LE);
+  if (err < 0) {
+    fprintf(stderr, "Can't set format to 16-bit: %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  // Play() always writes mono frames from a buffer sized for one channel, so
+  // continuing with any other channel count would read past that buffer.
+  err = snd_pcm_hw_params_set_channels(handle_, params, 1);
+  if (err < 0) {
+    fprintf(stderr, "Can't set channel number to 1: %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  // Not fatal: whatever rate the device ends up with is handled by the
+  // resampler created below.
+  uint32_t rate = sample_rate;
+  err = snd_pcm_hw_params_set_rate_near(handle_, params, &rate, nullptr);
+  if (err < 0) {
+    fprintf(stderr, "Can't set rate to %d. %s\n", sample_rate,
+            snd_strerror(err));
+  }
+
+  err = snd_pcm_hw_params(handle_, params);
+  if (err < 0) {
+    fprintf(stderr, "Can't set hardware parameters. %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  uint32_t device_rate = 0;
+  err = snd_pcm_hw_params_get_rate(params, &device_rate, nullptr);
+  if (err < 0) {
+    fprintf(stderr, "Can't get the actual sample rate. %s\n",
+            snd_strerror(err));
+    exit(-1);
+  }
+
+  int32_t actual_sample_rate = device_rate;
+  if (actual_sample_rate != sample_rate) {
+    fprintf(stderr,
+            "Creating a resampler:\n"
+            "   in_sample_rate: %d\n"
+            "   output_sample_rate: %d\n",
+            sample_rate, actual_sample_rate);
+
+    float min_freq = std::min(actual_sample_rate, sample_rate);
+    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+
+    int32_t lowpass_filter_width = 6;
+    resampler_ = std::make_unique<LinearResample>(
+        sample_rate, actual_sample_rate, lowpass_cutoff, lowpass_filter_width);
+  }
+
+  // buf_ is the int16 staging buffer used by Play(); one period per write.
+  // It must never be empty, otherwise Play() could not make progress.
+  snd_pcm_uframes_t frames = 0;
+  err = snd_pcm_hw_params_get_period_size(params, &frames, nullptr);
+  if (err < 0 || frames == 0) {
+    frames = kFallbackFrames;
+  }
+  buf_.resize(frames);
+}
+
+// Plays `samples` (mono floats, nominally in [-1, 1]) on the device.
+//
+// The input is resampled first when a resampler was created in
+// SetParameters(). It is then converted to int16 and written chunk by chunk,
+// where a chunk holds at most buf_.size() frames. After an underrun (XRUN)
+// the device is re-prepared and the same chunk is written again. Any other
+// write error terminates the process.
+void AlsaPlay::Play(const std::vector<float> &samples) {
+  std::vector<float> resampled;
+  const float *p = samples.data();
+  int32_t num_samples = samples.size();
+  if (resampler_) {
+    resampler_->Resample(samples.data(), samples.size(), false, &resampled);
+    p = resampled.data();
+    num_samples = resampled.size();
+  }
+
+  if (buf_.empty()) {
+    // A zero-sized staging buffer would make the chunk loop below spin
+    // forever without writing anything.
+    buf_.resize(1024);
+  }
+
+  const int32_t chunk_size = buf_.size();
+  for (int32_t i = 0; i < num_samples; i += chunk_size) {
+    const int32_t n = std::min(chunk_size, num_samples - i);
+    for (int32_t k = 0; k != n; ++k) {
+      // Clamp first: converting an out-of-range float to int16_t is undefined.
+      const float v = std::max(-1.0f, std::min(1.0f, p[i + k]));
+      buf_[k] = static_cast<int16_t>(v * 32767);
+    }
+
+    // Keep writing until the whole chunk has been accepted by the device.
+    const int16_t *q = buf_.data();
+    int32_t remaining = n;
+    while (remaining > 0) {
+      const snd_pcm_sframes_t written = snd_pcm_writei(handle_, q, remaining);
+      if (written == -EINTR) {
+        // Interrupted by a signal before anything was written; try again.
+        continue;
+      }
+
+      if (written == -EPIPE) {
+        fprintf(stderr, "XRUN.\n");
+        const int32_t err = snd_pcm_prepare(handle_);
+        if (err < 0) {
+          fprintf(stderr, "Can't recover from XRUN: %s\n", snd_strerror(err));
+          exit(-1);
+        }
+        continue;
+      }
+
+      if (written < 0) {
+        fprintf(stderr, "Can't write to PCM device: %s\n",
+                snd_strerror(static_cast<int32_t>(written)));
+        exit(-1);
+      }
+
+      q += written;
+      remaining -= static_cast<int32_t>(written);
+    }
+  }
+}
+
+// Waits for all the samples already written to the device to be played.
+// A failure is reported but otherwise ignored.
+void AlsaPlay::Drain() {
+  const int32_t status = snd_pcm_drain(handle_);
+  if (status >= 0) {
+    return;
+  }
+
+  printf("Failed to drain pcm. %s\n", snd_strerror(status));
+}
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_ENABLE_ALSA
--- a/sherpa-onnx/csrc/alsa-play.h
+++ b/sherpa-onnx/csrc/alsa-play.h
@@ -0,0 +1,37 @@
+// sherpa-onnx/csrc/alsa-play.h
+//
+// Copyright (c)  2022-2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_ALSA_PLAY_H_
+#define SHERPA_ONNX_CSRC_ALSA_PLAY_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "alsa/asoundlib.h"
+#include "sherpa-onnx/csrc/resample.h"
+
+namespace sherpa_onnx {
+
+// Plays single-channel float audio through an ALSA PCM playback device.
+//
+// The object owns the PCM handle: it is opened in the constructor and closed
+// in the destructor. Copying is therefore disabled; a copy would close the
+// same handle twice.
+class AlsaPlay {
+ public:
+  // Opens the playback device `device_name` and configures it for
+  // `sample_rate`. Exits the process if the device cannot be opened.
+  AlsaPlay(const char *device_name, int32_t sample_rate);
+  ~AlsaPlay();
+
+  AlsaPlay(const AlsaPlay &) = delete;
+  AlsaPlay &operator=(const AlsaPlay &) = delete;
+
+  // Writes `samples` to the device. The samples are resampled first if the
+  // device runs at a rate different from the one passed to the constructor.
+  void Play(const std::vector<float> &samples);
+
+  // wait for all the samples to be played
+  void Drain();
+
+ private:
+  // Sets access mode, sample format, channel count and rate on handle_, and
+  // sizes buf_.
+  void SetParameters(int32_t sample_rate);
+
+ private:
+  snd_pcm_t *handle_ = nullptr;
+  // Non-null only when the device rate differs from the requested one.
+  std::unique_ptr<LinearResample> resampler_;
+  // Staging buffer holding the int16 samples of one write to the device.
+  std::vector<int16_t> buf_;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_ALSA_PLAY_H_
--- a/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
+++ b/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
@@ -0,0 +1,218 @@
+// sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
+//
+// Copyright (c)  2022-2023  Xiaomi Corporation
+
+// see https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
+// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m___h_w___params.html
+// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
+
+#include <signal.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
+#include <cstdlib>
+#include <fstream>
+#include <mutex>  // NOLINT
+#include <queue>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "sherpa-onnx/csrc/alsa-play.h"
+#include "sherpa-onnx/csrc/offline-tts.h"
+#include "sherpa-onnx/csrc/parse-options.h"
+#include "sherpa-onnx/csrc/wave-writer.h"
+
+// Signalled whenever a new chunk is queued or generation has finished.
+static std::condition_variable g_cv;
+
+// Queue of generated audio chunks shared between the generating thread
+// (producer) and the playback thread (consumer). `mutex` protects `samples`
+// and is the mutex used together with g_cv.
+struct Buffer {
+  std::queue<std::vector<float>> samples;
+  std::mutex mutex;
+};
+
+static Buffer g_buffer;
+
+// Set by main() once generation is done; no more chunks will be queued.
+static std::atomic<bool> g_stopped{false};
+// Set by the SIGINT handler; playback should stop as soon as possible.
+static std::atomic<bool> g_killed{false};
+
+// g_killed is written from a signal handler, which requires a lock-free atomic.
+static_assert(ATOMIC_BOOL_LOCK_FREE == 2,
+              "std::atomic<bool> must be lock-free to be used in a signal "
+              "handler");
+
+// SIGINT handler. The first Ctrl + C requests a graceful stop; a second one
+// terminates the process immediately. Only async-signal-safe functions are
+// used here.
+static void Handler(int32_t /*sig*/) {
+  if (g_killed) {
+    std::_Exit(0);
+  }
+
+  g_killed = true;
+
+  static const char kMessage[] = "\nCaught Ctrl + C. Exiting\n";
+  // Best effort: there is nothing useful to do if the write fails.
+  ssize_t unused = write(STDERR_FILENO, kMessage, sizeof(kMessage) - 1);
+  (void)unused;
+}
+
+// Called by the TTS engine with `n` newly generated samples starting at `s`.
+// Copies them into the shared queue and wakes up the playback thread.
+static void AudioGeneratedCallback(const float *s, int32_t n) {
+  if (n <= 0) {
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(g_buffer.mutex);
+    g_buffer.samples.emplace(s, s + n);
+  }
+  g_cv.notify_all();
+}
+
+// Entry point of the playback thread.
+//
+// Plays queued chunks in order until either Ctrl + C is pressed (returns
+// without draining) or generation has finished and the queue is empty (waits
+// for the device to finish playing before returning).
+static void StartPlayback(const std::string &device_name, int32_t sample_rate) {
+  sherpa_onnx::AlsaPlay alsa(device_name.c_str(), sample_rate);
+
+  // The wait is bounded because g_killed is set from a signal handler, which
+  // cannot notify the condition variable, and because g_stopped is set
+  // without holding g_buffer.mutex, so its notification could be missed.
+  const auto kPollInterval = std::chrono::milliseconds(100);
+
+  while (!g_killed) {
+    std::vector<float> chunk;
+    {
+      std::unique_lock<std::mutex> lock(g_buffer.mutex);
+      g_cv.wait_for(lock, kPollInterval, [] {
+        return g_killed || g_stopped || !g_buffer.samples.empty();
+      });
+
+      if (g_killed) {
+        return;
+      }
+
+      if (g_buffer.samples.empty()) {
+        if (g_stopped) {
+          // Generation is done and everything queued has been played.
+          break;
+        }
+        continue;  // timeout without data; wait again
+      }
+
+      chunk = std::move(g_buffer.samples.front());
+      g_buffer.samples.pop();
+    }
+
+    // Play outside the lock so that the producer is never blocked by audio
+    // output.
+    alsa.Play(chunk);
+  }
+
+  if (g_killed) {
+    return;
+  }
+
+  alsa.Drain();
+}
+
+// Generates speech for the single positional argument, plays it through ALSA
+// while it is being generated, and saves the complete audio to
+// --output-filename.
+int main(int32_t argc, char *argv[]) {
+  signal(SIGINT, Handler);
+
+  const char *kUsageMessage = R"usage(
+Offline text-to-speech with sherpa-onnx.
+
+It plays the generated audio as the model is processing.
+
+Note that it is alsa so it works only on **Linux**. For instance, you can
+use it on Raspberry Pi.
+
+Usage example:
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
+tar xf vits-piper-en_US-amy-low.tar.bz2
+
+./bin/sherpa-onnx-offline-tts-play-alsa \
+ --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
+ --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
+ --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
+ --output-filename=./generated.wav \
+ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
+
+It will generate a file ./generated.wav as specified by --output-filename.
+
+You can find more models at
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
+
+Please see
+https://k2-fsa.github.io/sherpa/onnx/tts/index.html
+for details.
+)usage";
+
+  sherpa_onnx::ParseOptions po(kUsageMessage);
+  std::string device_name = "default";
+  std::string output_filename = "./generated.wav";
+  int32_t sid = 0;
+
+  po.Register("output-filename", &output_filename,
+              "Path to save the generated audio");
+
+  po.Register("device-name", &device_name,
+              "Name of the device to play the generated audio");
+
+  po.Register("sid", &sid,
+              "Speaker ID. Used only for multi-speaker models, e.g., models "
+              "trained using the VCTK dataset. Not used for single-speaker "
+              "models, e.g., models trained using the LJSpeech dataset");
+
+  sherpa_onnx::OfflineTtsConfig config;
+
+  config.Register(&po);
+  po.Read(argc, argv);
+
+  if (po.NumArgs() == 0) {
+    fprintf(stderr, "Error: Please provide the text to generate audio.\n\n");
+    po.PrintUsage();
+    exit(EXIT_FAILURE);
+  }
+
+  if (po.NumArgs() > 1) {
+    fprintf(stderr,
+            "Error: Accept only one positional argument. Please use single "
+            "quotes to wrap your text\n");
+    po.PrintUsage();
+    exit(EXIT_FAILURE);
+  }
+
+  if (!config.Validate()) {
+    fprintf(stderr, "Errors in config!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // This program always forces max_num_sentences to 1.
+  if (config.max_num_sentences != 1) {
+    fprintf(stderr, "Setting config.max_num_sentences to 1\n");
+    config.max_num_sentences = 1;
+  }
+
+  fprintf(stderr, "Loading the model\n");
+  sherpa_onnx::OfflineTts tts(config);
+
+  fprintf(stderr, "Start the playback thread\n");
+  std::thread playback_thread(StartPlayback, device_name, tts.SampleRate());
+
+  float speed = 1.0;
+
+  fprintf(stderr, "Generating ...\n");
+  const auto begin = std::chrono::steady_clock::now();
+  auto audio = tts.Generate(po.GetArg(1), sid, speed, AudioGeneratedCallback);
+  const auto end = std::chrono::steady_clock::now();
+
+  // Tell the playback thread that no more audio will be queued.
+  g_stopped = true;
+  g_cv.notify_all();
+  fprintf(stderr, "Generating done!\n");
+  if (audio.samples.empty()) {
+    fprintf(
+        stderr,
+        "Error in generating audio. Please read previous error messages.\n");
+    // Join before leaving: exiting while the playback thread is still running
+    // would destroy the global state it is using.
+    playback_thread.join();
+    return EXIT_FAILURE;
+  }
+
+  float elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
+          .count() /
+      1000.;
+  float duration = audio.samples.size() / static_cast<float>(audio.sample_rate);
+
+  float rtf = elapsed_seconds / duration;
+  fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
+  fprintf(stderr, "Audio duration: %.3f s\n", duration);
+  fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds,
+          duration, rtf);
+
+  bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate,
+                                   audio.samples.data(), audio.samples.size());
+  if (!ok) {
+    fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str());
+    // See above: never exit while the playback thread is still running.
+    playback_thread.join();
+    return EXIT_FAILURE;
+  }
+
+  fprintf(stderr, "The text is: %s. Speaker ID: %d\n\n", po.GetArg(1).c_str(),
+          sid);
+  fprintf(stderr, "\n**** Saved to %s successfully! ****\n",
+          output_filename.c_str());
+
+  fprintf(stderr, "\n");
+  fprintf(
+      stderr,
+      "Wait for the playback to finish. You can safely press ctrl + C to stop "
+      "the playback.\n");
+  playback_thread.join();
+
+  fprintf(stderr, "Done!\n");
+
+  return 0;
+}