Support playing generated audio as it is generating for MFC. (#462)
* Support playing generated audio as it is generating for MFC. * support espeak-ng-data
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -80,3 +80,4 @@ jslint.mjs
|
||||
vits-piper-en_US-amy-low
|
||||
vits-piper-*-*-*
|
||||
log
|
||||
*.exe
|
||||
|
||||
@@ -214,7 +214,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
|
||||
param.sampleFormat = paFloat32;
|
||||
param.suggestedLatency = info->defaultLowInputLatency;
|
||||
param.hostApiSpecificStreamInfo = nullptr;
|
||||
float sample_rate = config_.feat_config.sample_rate;
|
||||
float sample_rate = static_cast<float>(config_.feat_config.sample_rate);
|
||||
pa_stream_ = nullptr;
|
||||
PaError err =
|
||||
Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */
|
||||
@@ -259,7 +259,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
|
||||
SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_);
|
||||
|
||||
AcceptWaveformOffline(stream, config_.feat_config.sample_rate,
|
||||
samples_.data(), samples_.size());
|
||||
samples_.data(), static_cast<int32_t>(samples_.size()));
|
||||
DecodeOfflineStream(recognizer_, stream);
|
||||
auto r = GetOfflineStreamResult(stream);
|
||||
results_.emplace_back(r->text);
|
||||
|
||||
Binary file not shown.
@@ -9,14 +9,184 @@
|
||||
#include "afxdialogex.h"
|
||||
|
||||
#include <algorithm>
#include <atomic>
#include <condition_variable>
#include <fstream>
#include <mutex>  // NOLINT
#include <queue>
#include <stdexcept>
#include <string>
#include <thread>  // NOLINT
#include <vector>
|
||||
|
||||
#ifdef _DEBUG
|
||||
#define new DEBUG_NEW
|
||||
#endif
|
||||
|
||||
// Initializes the PortAudio library. If initialization fails, the PortAudio
// error text is printed to stderr and the process exits with status -2.
Microphone::Microphone() {
  const PaError status = Pa_Initialize();
  if (status == paNoError) {
    return;
  }

  fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(status));
  exit(-2);
}
|
||||
|
||||
// Shuts down the PortAudio library.
//
// A failure is only reported on stderr. The destructor deliberately does not
// call exit(): terminating the process from inside a destructor skips the
// remaining cleanup of the owning object, and calling exit() while the program
// is already exiting is undefined behavior.
Microphone::~Microphone() {
  const PaError err = Pa_Terminate();
  if (err != paNoError) {
    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  }
}
|
||||
|
||||
// NOTE(fangjun): Code is copied from
|
||||
// https://github.com/k2-fsa/sherpa-onnx/blob/master/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc#L22
|
||||
// Signalled by PlayCallbackFinished() once the output stream has finished;
// StartPlayback() blocks on it until playback is done or cancelled.
static std::condition_variable g_cv;

// Mutex that StartPlayback() holds while waiting on g_cv.
static std::mutex g_cv_m;
|
||||
|
||||
// One chunk of generated audio together with a read cursor.
struct Samples {
  // The audio samples of this chunk.
  std::vector<float> data;

  // Number of leading entries of `data` that have already been copied out
  // for playback; playback of this chunk resumes at data[consumed].
  int32_t consumed{0};
};
|
||||
|
||||
struct Buffer {
|
||||
std::queue<Samples> samples;
|
||||
std::mutex mutex;
|
||||
};
|
||||
|
||||
static Buffer g_buffer;
|
||||
|
||||
// Playback state flags. They are written on one thread and read on others
// (for example g_killed is set by the dialog's click handler and polled by
// PlayCallback() before any lock is taken), so they are atomics rather than
// plain bools to keep those accesses free of data races.

// Set once the first chunk of generated audio has been queued.
static std::atomic<bool> g_started{false};

// Set when no further audio will be queued; PlayCallback() then completes the
// stream as soon as the queue has been drained.
static std::atomic<bool> g_stopped{false};

// Set to abort playback immediately, discarding any queued audio.
static std::atomic<bool> g_killed{false};
|
||||
|
||||
static void AudioGeneratedCallback(const float *s, int32_t n) {
|
||||
if (n > 0) {
|
||||
Samples samples;
|
||||
samples.data = std::vector<float>{s, s + n};
|
||||
|
||||
std::lock_guard<std::mutex> lock(g_buffer.mutex);
|
||||
g_buffer.samples.push(std::move(samples));
|
||||
g_started = true;
|
||||
}
|
||||
}
|
||||
|
||||
static int PlayCallback(const void * /*in*/, void *out,
|
||||
unsigned long _n, // NOLINT
|
||||
const PaStreamCallbackTimeInfo * /*time_info*/,
|
||||
PaStreamCallbackFlags /*status_flags*/,
|
||||
void * /*user_data*/) {
|
||||
int32_t n = static_cast<int32_t>(_n);
|
||||
if (g_killed) {
|
||||
return paComplete;
|
||||
}
|
||||
|
||||
float *pout = reinterpret_cast<float *>(out);
|
||||
std::lock_guard<std::mutex> lock(g_buffer.mutex);
|
||||
|
||||
if (g_buffer.samples.empty()) {
|
||||
if (g_stopped) {
|
||||
// no more data is available and we have processed all of the samples
|
||||
return paComplete;
|
||||
}
|
||||
|
||||
// The current sentence is so long, though very unlikely, that
|
||||
// the model has not finished processing it yet.
|
||||
std::fill_n(pout, n, 0);
|
||||
|
||||
return paContinue;
|
||||
}
|
||||
|
||||
int32_t k = 0;
|
||||
for (; k < n && !g_buffer.samples.empty();) {
|
||||
int32_t this_block = n - k;
|
||||
|
||||
auto &p = g_buffer.samples.front();
|
||||
|
||||
int32_t remaining = static_cast<int32_t>(p.data.size()) - p.consumed;
|
||||
|
||||
if (this_block <= remaining) {
|
||||
std::copy(p.data.begin() + p.consumed,
|
||||
p.data.begin() + p.consumed + this_block, pout + k);
|
||||
p.consumed += this_block;
|
||||
|
||||
k = n;
|
||||
|
||||
if (p.consumed == p.data.size()) {
|
||||
g_buffer.samples.pop();
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
std::copy(p.data.begin() + p.consumed, p.data.end(), pout + k);
|
||||
k += static_cast<int32_t>(p.data.size()) - p.consumed;
|
||||
g_buffer.samples.pop();
|
||||
}
|
||||
|
||||
if (k < n) {
|
||||
std::fill_n(pout + k, n - k, 0);
|
||||
}
|
||||
|
||||
if (g_stopped && g_buffer.samples.empty()) {
|
||||
return paComplete;
|
||||
}
|
||||
|
||||
return paContinue;
|
||||
}
|
||||
|
||||
// Stream-finished hook that StartPlayback() registers with
// Pa_SetStreamFinishedCallback(); wakes every thread waiting on g_cv.
// The user-data pointer is unused.
static void PlayCallbackFinished(void * /*user_data*/) {
  g_cv.notify_all();
}
|
||||
|
||||
static void StartPlayback(int32_t sample_rate) {
|
||||
int32_t frames_per_buffer = 1024;
|
||||
PaStreamParameters outputParameters;
|
||||
PaStream *stream;
|
||||
PaError err;
|
||||
|
||||
outputParameters.device =
|
||||
Pa_GetDefaultOutputDevice(); /* default output device */
|
||||
|
||||
outputParameters.channelCount = 1; /* stereo output */
|
||||
outputParameters.sampleFormat = paFloat32; /* 32 bit floating point output */
|
||||
outputParameters.suggestedLatency =
|
||||
Pa_GetDeviceInfo(outputParameters.device)->defaultLowOutputLatency;
|
||||
outputParameters.hostApiSpecificStreamInfo = nullptr;
|
||||
|
||||
err = Pa_OpenStream(&stream, nullptr, /* no input */
|
||||
&outputParameters, sample_rate, frames_per_buffer,
|
||||
paClipOff, // we won't output out of range samples so
|
||||
// don't bother clipping them
|
||||
PlayCallback, nullptr);
|
||||
if (err != paNoError) {
|
||||
fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = Pa_SetStreamFinishedCallback(stream, &PlayCallbackFinished);
|
||||
if (err != paNoError) {
|
||||
fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = Pa_StartStream(stream);
|
||||
if (err != paNoError) {
|
||||
fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
|
||||
return;
|
||||
}
|
||||
|
||||
std::unique_lock<std::mutex> lock(g_cv_m);
|
||||
while (!g_killed && !g_stopped &&
|
||||
(!g_started || (g_started && !g_buffer.samples.empty()))) {
|
||||
g_cv.wait(lock);
|
||||
}
|
||||
|
||||
err = Pa_StopStream(stream);
|
||||
if (err != paNoError) {
|
||||
return;
|
||||
}
|
||||
|
||||
err = Pa_CloseStream(stream);
|
||||
if (err != paNoError) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// CAboutDlg dialog used for App About
|
||||
|
||||
@@ -261,8 +431,8 @@ void CNonStreamingTextToSpeechDlg::Init() {
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!Exists("./lexicon.txt")) {
|
||||
error_message += "Cannot find ./lexicon.txt\r\n";
|
||||
if (!Exists("./lexicon.txt") && !Exists("./espeak-ng-data/phontab")) {
|
||||
error_message += "Cannot find espeak-ng-data directory or ./lexicon.txt\r\n";
|
||||
ok = false;
|
||||
}
|
||||
|
||||
@@ -275,21 +445,17 @@ void CNonStreamingTextToSpeechDlg::Init() {
|
||||
generate_btn_.EnableWindow(FALSE);
|
||||
error_message +=
|
||||
"\r\nPlease refer to\r\n"
|
||||
"https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html";
|
||||
"https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models";
|
||||
error_message += "\r\nto download models.\r\n";
|
||||
error_message += "\r\nWe given an example below\r\n\r\n";
|
||||
error_message += "\r\nWe give an example below\r\n\r\n";
|
||||
error_message +=
|
||||
"wget -O model.onnx "
|
||||
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
|
||||
"vits-aishell3.onnx\r\n";
|
||||
error_message +=
|
||||
"wget "
|
||||
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
|
||||
"lexicon.txt\r\n";
|
||||
error_message +=
|
||||
"wget "
|
||||
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
|
||||
"tokens.txt\r\n";
|
||||
"1. Download vits-piper-en_US-amy-low.tar.bz2 from the following URL\r\n\r\n"
|
||||
"https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2\r\n\r\n"
|
||||
"2. Uncompress it and you will get a directory vits-piper-en_US-amy-low \r\n\r\n"
|
||||
"3. Switch to the directory vits-piper-en_US-amy-low \r\n\r\n"
|
||||
"4. Rename en_US-amy-low.onnx to model.onnx \r\n\r\n"
|
||||
"5. Copy the current exe to the directory vits-piper-en_US-amy-low\r\n\r\n"
|
||||
"6. Done! You can now run the exe in the directory vits-piper-en_US-amy-low\r\n\r\n";
|
||||
|
||||
AppendLineToMultilineEditCtrl(my_hint_, error_message);
|
||||
return;
|
||||
@@ -299,10 +465,14 @@ void CNonStreamingTextToSpeechDlg::Init() {
|
||||
SherpaOnnxOfflineTtsConfig config;
|
||||
memset(&config, 0, sizeof(config));
|
||||
config.model.debug = 0;
|
||||
config.model.num_threads = 1;
|
||||
config.model.num_threads = 2;
|
||||
config.model.provider = "cpu";
|
||||
config.model.vits.model = "./model.onnx";
|
||||
config.model.vits.lexicon = "./lexicon.txt";
|
||||
if (Exists("./espeak-ng-data/phontab")) {
|
||||
config.model.vits.data_dir = "./espeak-ng-data";
|
||||
} else {
|
||||
config.model.vits.lexicon = "./lexicon.txt";
|
||||
}
|
||||
config.model.vits.tokens = "./tokens.txt";
|
||||
|
||||
tts_ = SherpaOnnxCreateOfflineTts(&config);
|
||||
@@ -321,7 +491,6 @@ void CNonStreamingTextToSpeechDlg::Init() {
|
||||
}
|
||||
|
||||
void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
|
||||
// TODO: Add your control notification handler code here
|
||||
CString s;
|
||||
speaker_id_.GetWindowText(s);
|
||||
int speaker_id = _ttoi(s);
|
||||
@@ -338,25 +507,51 @@ void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
|
||||
}
|
||||
|
||||
my_text_.GetWindowText(s);
|
||||
|
||||
std::string ss = ToString(s);
|
||||
if (ss.empty()) {
|
||||
AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK);
|
||||
return;
|
||||
}
|
||||
|
||||
if (play_thread_) {
|
||||
g_killed = true;
|
||||
g_stopped = true;
|
||||
if (play_thread_->joinable()) {
|
||||
play_thread_->join();
|
||||
}
|
||||
}
|
||||
|
||||
g_killed = false;
|
||||
g_stopped = false;
|
||||
g_started = false;
|
||||
g_buffer.samples = {};
|
||||
|
||||
// Caution(fangjun): It is not efficient to re-create the thread. We use this approach
|
||||
// for simplicity
|
||||
play_thread_ = std::make_unique<std::thread>(StartPlayback, SherpaOnnxOfflineTtsSampleRate(tts_));
|
||||
|
||||
generate_btn_.EnableWindow(FALSE);
|
||||
|
||||
const SherpaOnnxGeneratedAudio *audio =
|
||||
SherpaOnnxOfflineTtsGenerate(tts_, ss.c_str(), speaker_id, speed);
|
||||
SherpaOnnxOfflineTtsGenerateWithCallback(tts_, ss.c_str(), speaker_id, speed, &AudioGeneratedCallback);
|
||||
|
||||
generate_btn_.EnableWindow(TRUE);
|
||||
|
||||
output_filename_.GetWindowText(s);
|
||||
std::string filename = ToString(s);
|
||||
|
||||
int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
|
||||
filename.c_str());
|
||||
|
||||
SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
|
||||
|
||||
if (ok) {
|
||||
AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
|
||||
// AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
|
||||
AppendLineToMultilineEditCtrl(my_hint_, std::string("Saved to ") + filename + " successfully");
|
||||
} else {
|
||||
AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
|
||||
// AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
|
||||
AppendLineToMultilineEditCtrl(my_hint_, std::string("Failed to saved to ") + filename);
|
||||
}
|
||||
|
||||
//CDialogEx::OnOK();
|
||||
|
||||
@@ -6,6 +6,16 @@
|
||||
|
||||
#include "sherpa-onnx/c-api/c-api.h"
|
||||
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
|
||||
#include "portaudio.h"
|
||||
|
||||
// Scope guard for the PortAudio library: the constructor initializes
// PortAudio and the destructor terminates it.
class Microphone {
 public:
  Microphone();
  ~Microphone();

  // Not copyable: a copy would run the destructor (and thus terminate
  // PortAudio) one more time than the constructor initialized it.
  Microphone(const Microphone &) = delete;
  Microphone &operator=(const Microphone &) = delete;
};
|
||||
|
||||
// CNonStreamingTextToSpeechDlg dialog
|
||||
class CNonStreamingTextToSpeechDlg : public CDialogEx
|
||||
@@ -34,16 +44,21 @@ protected:
|
||||
afx_msg void OnPaint();
|
||||
afx_msg HCURSOR OnQueryDragIcon();
|
||||
DECLARE_MESSAGE_MAP()
|
||||
public:
|
||||
CEdit my_hint_;
|
||||
CEdit speaker_id_;
|
||||
CEdit speed_;
|
||||
void Init();
|
||||
void InitHint();
|
||||
CButton generate_btn_;
|
||||
afx_msg void OnBnClickedOk();
|
||||
public:
|
||||
CEdit my_hint_;
|
||||
CEdit speaker_id_;
|
||||
CEdit speed_;
|
||||
void Init();
|
||||
void InitHint();
|
||||
CButton generate_btn_;
|
||||
afx_msg void OnBnClickedOk();
|
||||
|
||||
SherpaOnnxOfflineTts *tts_ = nullptr;
|
||||
CEdit my_text_;
|
||||
CEdit output_filename_;
|
||||
|
||||
private:
|
||||
Microphone mic_;
|
||||
std::unique_ptr<std::thread> play_thread_;
|
||||
|
||||
SherpaOnnxOfflineTts *tts_;
|
||||
CEdit my_text_;
|
||||
CEdit output_filename_;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user