Add on-device text-to-speech (TTS) demo for HarmonyOS (#1590)
This commit is contained in:
@@ -1169,6 +1169,17 @@ SherpaOnnxOfflineTtsGenerateWithProgressCallback(
|
||||
return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper);
|
||||
}
|
||||
|
||||
// Generates audio for `text` and reports progress through a C callback that
// also receives the caller-supplied `arg` pointer.
//
// `callback` is invoked as callback(samples, n, progress, arg) and its return
// value is handed back unchanged to the internal generator. `arg` is only
// forwarded; it is never dereferenced here.
const SherpaOnnxGeneratedAudio *
SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg(
    const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
    SherpaOnnxGeneratedAudioProgressCallbackWithArg callback, void *arg) {
  // Adapt the four-argument C callback to the three-argument callable that is
  // passed to SherpaOnnxOfflineTtsGenerateInternal().
  auto forward_with_arg = [callback, arg](const float *pcm,
                                          int32_t num_samples,
                                          float fraction_done) {
    return callback(pcm, num_samples, fraction_done, arg);
  };

  return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed,
                                              forward_with_arg);
}
|
||||
|
||||
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(
|
||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||
SherpaOnnxGeneratedAudioCallbackWithArg callback, void *arg) {
|
||||
|
||||
@@ -930,6 +930,9 @@ typedef int32_t (*SherpaOnnxGeneratedAudioCallbackWithArg)(const float *samples,
|
||||
typedef int32_t (*SherpaOnnxGeneratedAudioProgressCallback)(
|
||||
const float *samples, int32_t n, float p);
|
||||
|
||||
// Progress callback that additionally receives an opaque user pointer.
// It is invoked as callback(samples, n, p, arg), where `arg` is the pointer
// given to SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg().
// NOTE(review): `p` is presumably the completed fraction and the return value
// presumably tells the generator whether to continue -- confirm.
typedef int32_t (*SherpaOnnxGeneratedAudioProgressCallbackWithArg)(
|
||||
const float *samples, int32_t n, float p, void *arg);
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts;
|
||||
|
||||
// Create an instance of offline TTS. The user has to use DestroyOfflineTts()
|
||||
@@ -964,11 +967,19 @@ SherpaOnnxOfflineTtsGenerateWithCallback(
|
||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||
SherpaOnnxGeneratedAudioCallback callback);
|
||||
|
||||
SHERPA_ONNX_API
|
||||
const SherpaOnnxGeneratedAudio *
|
||||
SherpaOnnxOfflineTtsGenerateWithProgressCallback(
|
||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||
|
||||
SherpaOnnxGeneratedAudioProgressCallback callback);
|
||||
|
||||
// Same as SherpaOnnxOfflineTtsGenerateWithProgressCallback() but `arg` is
// passed as the last argument of every invocation of `callback`.
SHERPA_ONNX_API
|
||||
const SherpaOnnxGeneratedAudio *
|
||||
SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg(
|
||||
const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
|
||||
SherpaOnnxGeneratedAudioProgressCallbackWithArg callback, void *arg);
|
||||
|
||||
// Same as SherpaOnnxGeneratedAudioCallback but you can pass an additional
|
||||
// `void* arg` to the callback.
|
||||
SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *
|
||||
|
||||
@@ -22,8 +22,14 @@ CircularBuffer::CircularBuffer(int32_t capacity) {
|
||||
void CircularBuffer::Resize(int32_t new_capacity) {
|
||||
int32_t capacity = static_cast<int32_t>(buffer_.size());
|
||||
if (new_capacity <= capacity) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"new_capacity (%{public}d) <= original capacity (%{public}d). Skip it.",
|
||||
new_capacity, capacity);
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("new_capacity (%d) <= original capacity (%d). Skip it.",
|
||||
new_capacity, capacity);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -90,10 +96,18 @@ void CircularBuffer::Push(const float *p, int32_t n) {
|
||||
int32_t size = Size();
|
||||
if (n + size > capacity) {
|
||||
int32_t new_capacity = std::max(capacity * 2, n + size);
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Overflow! n: %{public}d, size: %{public}d, n+size: %{public}d, "
|
||||
"capacity: %{public}d. Increase "
|
||||
"capacity to: %{public}d. (Original data is copied. No data loss!)",
|
||||
n, size, n + size, capacity, new_capacity);
|
||||
#else
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Overflow! n: %d, size: %d, n+size: %d, capacity: %d. Increase "
|
||||
"capacity to: %d",
|
||||
"capacity to: %d. (Original data is copied. No data loss!)",
|
||||
n, size, n + size, capacity, new_capacity);
|
||||
#endif
|
||||
Resize(new_capacity);
|
||||
|
||||
capacity = new_capacity;
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <strstream>
|
||||
@@ -159,17 +160,26 @@ std::vector<TokenIDs> Lexicon::ConvertTextToTokenIdsChinese(
|
||||
words = ProcessHeteronyms(words);
|
||||
|
||||
if (debug_) {
|
||||
fprintf(stderr, "Input text in string: %s\n", text.c_str());
|
||||
fprintf(stderr, "Input text in bytes:");
|
||||
std::ostringstream os;
|
||||
|
||||
os << "Input text in string: " << text << "\n";
|
||||
os << "Input text in bytes:";
|
||||
// Dump every byte of the input text as two hex digits.
for (uint8_t c : text) {
  fprintf(stderr, " %02x", c);
  // uint8_t is a character type for iostreams: streaming `c` directly inserts
  // the raw byte and std::hex has no effect. Widen it to an integer so that
  // its numeric value is formatted instead.
  os << " 0x" << std::setfill('0') << std::setw(2) << std::right << std::hex
     << static_cast<int32_t>(c);
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "After splitting to words:");
|
||||
os << "\n";
|
||||
os << "After splitting to words:";
|
||||
for (const auto &w : words) {
|
||||
fprintf(stderr, " %s", w.c_str());
|
||||
os << " " << w;
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
os << "\n";
|
||||
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("%{public}s", os.str().c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> ans;
|
||||
@@ -259,17 +269,26 @@ std::vector<TokenIDs> Lexicon::ConvertTextToTokenIdsNotChinese(
|
||||
std::vector<std::string> words = SplitUtf8(text);
|
||||
|
||||
if (debug_) {
|
||||
fprintf(stderr, "Input text (lowercase) in string: %s\n", text.c_str());
|
||||
fprintf(stderr, "Input text in bytes:");
|
||||
std::ostringstream os;
|
||||
|
||||
os << "Input text (lowercase) in string: " << text << "\n";
|
||||
os << "Input text in bytes:";
|
||||
// Dump every byte of the (lowercased) input text as two hex digits.
for (uint8_t c : text) {
  fprintf(stderr, " %02x", c);
  // uint8_t is a character type for iostreams: streaming `c` directly inserts
  // the raw byte and std::hex has no effect. Widen it to an integer so that
  // its numeric value is formatted instead.
  os << " 0x" << std::setfill('0') << std::setw(2) << std::right << std::hex
     << static_cast<int32_t>(c);
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "After splitting to words:");
|
||||
os << "\n";
|
||||
os << "After splitting to words:";
|
||||
for (const auto &w : words) {
|
||||
fprintf(stderr, " %s", w.c_str());
|
||||
os << " " << w;
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
os << "\n";
|
||||
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("%{public}s", os.str().c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
int32_t blank = token2id_.at(" ");
|
||||
|
||||
@@ -6,11 +6,21 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <regex> // NOLINT
|
||||
#include <strstream>
|
||||
#include <utility>
|
||||
#if __ANDROID_API__ >= 9
|
||||
#include "android/asset_manager.h"
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#if __OHOS__
|
||||
#include "rawfile/raw_file_manager.h"
|
||||
#endif
|
||||
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
#include "sherpa-onnx/csrc/file-utils.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
#include "sherpa-onnx/csrc/symbol-table.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.h"
|
||||
|
||||
@@ -62,6 +72,60 @@ class MeloTtsLexicon::Impl {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Manager>
|
||||
Impl(Manager *mgr, const std::string &lexicon, const std::string &tokens,
|
||||
const std::string &dict_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data, bool debug)
|
||||
: meta_data_(meta_data), debug_(debug) {
|
||||
std::string dict = dict_dir + "/jieba.dict.utf8";
|
||||
std::string hmm = dict_dir + "/hmm_model.utf8";
|
||||
std::string user_dict = dict_dir + "/user.dict.utf8";
|
||||
std::string idf = dict_dir + "/idf.utf8";
|
||||
std::string stop_word = dict_dir + "/stop_words.utf8";
|
||||
|
||||
AssertFileExists(dict);
|
||||
AssertFileExists(hmm);
|
||||
AssertFileExists(user_dict);
|
||||
AssertFileExists(idf);
|
||||
AssertFileExists(stop_word);
|
||||
|
||||
jieba_ =
|
||||
std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);
|
||||
|
||||
{
|
||||
auto buf = ReadFile(mgr, tokens);
|
||||
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
InitTokens(is);
|
||||
}
|
||||
|
||||
{
|
||||
auto buf = ReadFile(mgr, lexicon);
|
||||
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
InitLexicon(is);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Manager>
|
||||
Impl(Manager *mgr, const std::string &lexicon, const std::string &tokens,
|
||||
const OfflineTtsVitsModelMetaData &meta_data, bool debug)
|
||||
: meta_data_(meta_data), debug_(debug) {
|
||||
{
|
||||
auto buf = ReadFile(mgr, tokens);
|
||||
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
InitTokens(is);
|
||||
}
|
||||
|
||||
{
|
||||
auto buf = ReadFile(mgr, lexicon);
|
||||
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
InitLexicon(is);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
|
||||
std::string text = ToLowerCase(_text);
|
||||
// see
|
||||
@@ -84,17 +148,24 @@ class MeloTtsLexicon::Impl {
|
||||
jieba_->Cut(text, words, is_hmm);
|
||||
|
||||
if (debug_) {
|
||||
SHERPA_ONNX_LOGE("input text: %s", text.c_str());
|
||||
SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str());
|
||||
|
||||
std::ostringstream os;
|
||||
std::string sep = "";
|
||||
for (const auto &w : words) {
|
||||
os << sep << w;
|
||||
sep = "_";
|
||||
}
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("input text: %{public}s", text.c_str());
|
||||
SHERPA_ONNX_LOGE("after replacing punctuations: %{public}s", s.c_str());
|
||||
|
||||
SHERPA_ONNX_LOGE("after jieba processing: %{public}s",
|
||||
os.str().c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("input text: %s", text.c_str());
|
||||
SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str());
|
||||
|
||||
SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str());
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
words = SplitUtf8(text);
|
||||
@@ -102,7 +173,7 @@ class MeloTtsLexicon::Impl {
|
||||
if (debug_) {
|
||||
fprintf(stderr, "Input text in string (lowercase): %s\n", text.c_str());
|
||||
fprintf(stderr, "Input text in bytes (lowercase):");
|
||||
// Iterate over the bytes as unsigned values. With a signed 8-bit type, bytes
// >= 0x80 become negative, get sign-extended when promoted to int for the
// variadic call, and "%02x" then prints eight hex digits (e.g. "ffffffe4")
// instead of two.
for (uint8_t c : text) {
  fprintf(stderr, " %02x", c);
}
|
||||
fprintf(stderr, "\n");
|
||||
@@ -307,9 +378,48 @@ MeloTtsLexicon::MeloTtsLexicon(const std::string &lexicon,
|
||||
bool debug)
|
||||
: impl_(std::make_unique<Impl>(lexicon, tokens, meta_data, debug)) {}
|
||||
|
||||
template <typename Manager>
|
||||
MeloTtsLexicon::MeloTtsLexicon(Manager *mgr, const std::string &lexicon,
|
||||
const std::string &tokens,
|
||||
const std::string &dict_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data,
|
||||
bool debug)
|
||||
: impl_(std::make_unique<Impl>(mgr, lexicon, tokens, dict_dir, meta_data,
|
||||
debug)) {}
|
||||
|
||||
template <typename Manager>
|
||||
MeloTtsLexicon::MeloTtsLexicon(Manager *mgr, const std::string &lexicon,
|
||||
const std::string &tokens,
|
||||
const OfflineTtsVitsModelMetaData &meta_data,
|
||||
bool debug)
|
||||
: impl_(std::make_unique<Impl>(mgr, lexicon, tokens, meta_data, debug)) {}
|
||||
|
||||
std::vector<TokenIDs> MeloTtsLexicon::ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string & /*unused_voice = ""*/) const {
|
||||
return impl_->ConvertTextToTokenIds(text);
|
||||
}
|
||||
|
||||
// Explicit instantiations of the two templated constructors for Android,
// where the manager type is AAssetManager.
#if __ANDROID_API__ >= 9
|
||||
template MeloTtsLexicon::MeloTtsLexicon(
|
||||
AAssetManager *mgr, const std::string &lexicon, const std::string &tokens,
|
||||
const std::string &dict_dir, const OfflineTtsVitsModelMetaData &meta_data,
|
||||
bool debug);
|
||||
|
||||
template MeloTtsLexicon::MeloTtsLexicon(
|
||||
AAssetManager *mgr, const std::string &lexicon, const std::string &tokens,
|
||||
const OfflineTtsVitsModelMetaData &meta_data, bool debug);
|
||||
#endif
|
||||
|
||||
// Explicit instantiations of the two templated constructors for OHOS
// (HarmonyOS), where the manager type is NativeResourceManager.
#if __OHOS__
|
||||
template MeloTtsLexicon::MeloTtsLexicon(
|
||||
NativeResourceManager *mgr, const std::string &lexicon,
|
||||
const std::string &tokens, const std::string &dict_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data, bool debug);
|
||||
|
||||
template MeloTtsLexicon::MeloTtsLexicon(
|
||||
NativeResourceManager *mgr, const std::string &lexicon,
|
||||
const std::string &tokens, const OfflineTtsVitsModelMetaData &meta_data,
|
||||
bool debug);
|
||||
#endif
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -25,6 +25,16 @@ class MeloTtsLexicon : public OfflineTtsFrontend {
|
||||
MeloTtsLexicon(const std::string &lexicon, const std::string &tokens,
|
||||
const OfflineTtsVitsModelMetaData &meta_data, bool debug);
|
||||
|
||||
// Reads `lexicon` and `tokens` through the platform resource manager `mgr`;
// the jieba dictionary files are looked up below `dict_dir`.
// The definition lives in the .cc file and is explicitly instantiated there
// for AAssetManager (Android) and NativeResourceManager (OHOS).
template <typename Manager>
|
||||
MeloTtsLexicon(Manager *mgr, const std::string &lexicon,
|
||||
const std::string &tokens, const std::string &dict_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data, bool debug);
|
||||
|
||||
// Reads `lexicon` and `tokens` through the platform resource manager `mgr`;
// no jieba dictionary directory is used.
// The definition lives in the .cc file and is explicitly instantiated there
// for AAssetManager (Android) and NativeResourceManager (OHOS).
template <typename Manager>
|
||||
MeloTtsLexicon(Manager *mgr, const std::string &lexicon,
|
||||
const std::string &tokens,
|
||||
const OfflineTtsVitsModelMetaData &meta_data, bool debug);
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIds(
|
||||
const std::string &text,
|
||||
const std::string &unused_voice = "") const override;
|
||||
|
||||
@@ -40,7 +40,11 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
tn_list_.reserve(files.size());
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
|
||||
#endif
|
||||
}
|
||||
tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
|
||||
}
|
||||
@@ -57,7 +61,11 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
  // On OHOS the argument needs the `%{public}` marker, matching the other
  // debug logs in this class; every other platform takes the plain
  // printf-style `%s`.
#if __OHOS__
  SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
#else
  SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
#endif
}
|
||||
std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
|
||||
fst::FarReader<fst::StdArc>::Open(f));
|
||||
@@ -88,7 +96,11 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
tn_list_.reserve(files.size());
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
|
||||
#endif
|
||||
}
|
||||
auto buf = ReadFile(mgr, f);
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
@@ -103,7 +115,11 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
auto buf = ReadFile(mgr, f);
|
||||
@@ -156,14 +172,22 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
|
||||
std::string text = _text;
|
||||
if (config_.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("Raw text: %s", text.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
if (!tn_list_.empty()) {
|
||||
for (const auto &tn : tn_list_) {
|
||||
text = tn->Normalize(text);
|
||||
if (config_.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -226,10 +250,17 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
int32_t num_batches = x_size / batch_size;
|
||||
|
||||
if (config_.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Text is too long. Split it into %{public}d batches. batch size: "
|
||||
"%{public}d. Number of sentences: %{public}d",
|
||||
num_batches, batch_size, x_size);
|
||||
#else
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Text is too long. Split it into %d batches. batch size: %d. Number "
|
||||
"of sentences: %d",
|
||||
num_batches, batch_size, x_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
GeneratedAudio ans;
|
||||
@@ -255,7 +286,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
audio.samples.end());
|
||||
if (callback) {
|
||||
should_continue = callback(audio.samples.data(), audio.samples.size(),
|
||||
b * 1.0 / num_batches);
|
||||
(b + 1) * 1.0 / num_batches);
|
||||
// Caution(fangjun): audio is freed when the callback returns, so users
|
||||
// should copy the data if they want to access the data after
|
||||
// the callback returns to avoid segmentation fault.
|
||||
@@ -297,6 +328,16 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
if (meta_data.frontend == "characters") {
|
||||
frontend_ = std::make_unique<OfflineTtsCharacterFrontend>(
|
||||
mgr, config_.model.vits.tokens, meta_data);
|
||||
} else if (meta_data.jieba && !config_.model.vits.dict_dir.empty() &&
|
||||
meta_data.is_melo_tts) {
|
||||
frontend_ = std::make_unique<MeloTtsLexicon>(
|
||||
mgr, config_.model.vits.lexicon, config_.model.vits.tokens,
|
||||
config_.model.vits.dict_dir, model_->GetMetaData(),
|
||||
config_.model.debug);
|
||||
} else if (meta_data.is_melo_tts && meta_data.language == "English") {
|
||||
frontend_ = std::make_unique<MeloTtsLexicon>(
|
||||
mgr, config_.model.vits.lexicon, config_.model.vits.tokens,
|
||||
model_->GetMetaData(), config_.model.debug);
|
||||
} else if ((meta_data.is_piper || meta_data.is_coqui ||
|
||||
meta_data.is_icefall) &&
|
||||
!config_.model.vits.data_dir.empty()) {
|
||||
|
||||
@@ -144,7 +144,11 @@ class OfflineTtsVitsModel::Impl {
|
||||
++i;
|
||||
}
|
||||
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
Ort::AllocatorWithDefaultOptions allocator; // used in the macro below
|
||||
|
||||
Reference in New Issue
Block a user