Use piper-phonemize to convert text to token IDs (#453)
This commit is contained in:
@@ -547,6 +547,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
|
||||
tts_config.model.vits.lexicon =
|
||||
SHERPA_ONNX_OR(config->model.vits.lexicon, "");
|
||||
tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, "");
|
||||
tts_config.model.vits.data_dir =
|
||||
SHERPA_ONNX_OR(config->model.vits.data_dir, "");
|
||||
tts_config.model.vits.noise_scale =
|
||||
SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667);
|
||||
tts_config.model.vits.noise_scale_w =
|
||||
@@ -558,6 +560,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
|
||||
tts_config.model.debug = config->model.debug;
|
||||
tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
|
||||
tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
|
||||
tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2);
|
||||
|
||||
if (tts_config.model.debug) {
|
||||
fprintf(stderr, "%s\n", tts_config.ToString().c_str());
|
||||
|
||||
@@ -607,6 +607,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig {
|
||||
const char *model;
|
||||
const char *lexicon;
|
||||
const char *tokens;
|
||||
const char *data_dir;
|
||||
|
||||
float noise_scale;
|
||||
float noise_scale_w;
|
||||
@@ -623,6 +624,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
|
||||
// Top-level configuration passed through the C API to create an offline TTS
// object. SherpaOnnxCreateOfflineTts() copies these fields into the C++
// config via SHERPA_ONNX_OR(field, default).
// NOTE(review): SHERPA_ONNX_OR presumably substitutes the default when the
// field is NULL/0 -- confirm against its definition.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
|
||||
// Model configuration.
SherpaOnnxOfflineTtsModelConfig model;
|
||||
// Rule FST filename(s); copied with "" as the fallback value.
const char *rule_fsts;
|
||||
// Maximum number of sentences processed at a time; copied with 2 as the
// fallback value.
int32_t max_num_sentences;
|
||||
} SherpaOnnxOfflineTtsConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {
|
||||
|
||||
@@ -74,6 +74,7 @@ set(sources
|
||||
packed-sequence.cc
|
||||
pad-sequence.cc
|
||||
parse-options.cc
|
||||
piper-phonemize-lexicon.cc
|
||||
provider.cc
|
||||
resample.cc
|
||||
session.cc
|
||||
|
||||
@@ -129,8 +129,8 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon,
|
||||
}
|
||||
#endif
|
||||
|
||||
std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
|
||||
const std::string &text) const {
|
||||
std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string & /*voice*/ /*= ""*/) const {
|
||||
switch (language_) {
|
||||
case Language::kEnglish:
|
||||
return ConvertTextToTokenIdsEnglish(text);
|
||||
@@ -150,7 +150,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
|
||||
std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsChinese(
|
||||
const std::string &text) const {
|
||||
std::vector<std::string> words;
|
||||
if (pattern_) {
|
||||
@@ -245,10 +245,10 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
|
||||
ans.push_back(eos);
|
||||
}
|
||||
|
||||
return ans;
|
||||
return {ans};
|
||||
}
|
||||
|
||||
std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
|
||||
std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsEnglish(
|
||||
const std::string &_text) const {
|
||||
std::string text(_text);
|
||||
ToLowerCase(&text);
|
||||
@@ -301,7 +301,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
|
||||
ans.push_back(token2id_.at("$")); // eos
|
||||
}
|
||||
|
||||
return ans;
|
||||
return {ans};
|
||||
}
|
||||
|
||||
void Lexicon::InitTokens(std::istream &is) { token2id_ = ReadTokens(is); }
|
||||
|
||||
@@ -18,11 +18,15 @@
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// TODO(fangjun): Refactor it to an abstract class
|
||||
class Lexicon {
|
||||
class Lexicon : public OfflineTtsFrontend {
|
||||
public:
|
||||
Lexicon() = default; // for subclasses
|
||||
//
|
||||
// Note: for models from piper, we won't use this class.
|
||||
Lexicon(const std::string &lexicon, const std::string &tokens,
|
||||
const std::string &punctuations, const std::string &language,
|
||||
bool debug = false, bool is_piper = false);
|
||||
@@ -34,28 +38,29 @@ class Lexicon {
|
||||
bool is_piper = false);
|
||||
#endif
|
||||
|
||||
std::vector<int64_t> ConvertTextToTokenIds(const std::string &text) const;
|
||||
std::vector<std::vector<int64_t>> ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string &voice = "") const override;
|
||||
|
||||
private:
|
||||
std::vector<int64_t> ConvertTextToTokenIdsGerman(
|
||||
std::vector<std::vector<int64_t>> ConvertTextToTokenIdsGerman(
|
||||
const std::string &text) const {
|
||||
return ConvertTextToTokenIdsEnglish(text);
|
||||
}
|
||||
|
||||
std::vector<int64_t> ConvertTextToTokenIdsSpanish(
|
||||
std::vector<std::vector<int64_t>> ConvertTextToTokenIdsSpanish(
|
||||
const std::string &text) const {
|
||||
return ConvertTextToTokenIdsEnglish(text);
|
||||
}
|
||||
|
||||
std::vector<int64_t> ConvertTextToTokenIdsFrench(
|
||||
std::vector<std::vector<int64_t>> ConvertTextToTokenIdsFrench(
|
||||
const std::string &text) const {
|
||||
return ConvertTextToTokenIdsEnglish(text);
|
||||
}
|
||||
|
||||
std::vector<int64_t> ConvertTextToTokenIdsEnglish(
|
||||
std::vector<std::vector<int64_t>> ConvertTextToTokenIdsEnglish(
|
||||
const std::string &text) const;
|
||||
|
||||
std::vector<int64_t> ConvertTextToTokenIdsChinese(
|
||||
std::vector<std::vector<int64_t>> ConvertTextToTokenIdsChinese(
|
||||
const std::string &text) const;
|
||||
|
||||
void InitLanguage(const std::string &lang);
|
||||
|
||||
@@ -43,6 +43,21 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Read an integer from the model's custom metadata, falling back to a default.
//
// Looks up `src_key` via meta_data.LookupCustomMetadataMapAllocated(). If the
// key is absent, `dst` is set to `default_value`; otherwise `dst` is set to
// atoi() of the stored string. A negative parsed value is logged and the
// process exits with -1.
//
// The expansion refers to variables named `meta_data` and `allocator`, so
// both must be visible at the point of use.
#define SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(dst, src_key, default_value) \
|
||||
do { \
|
||||
auto value = \
|
||||
meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \
|
||||
if (!value) { \
|
||||
dst = default_value; \
|
||||
} else { \
|
||||
dst = atoi(value.get()); \
|
||||
if (dst < 0) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value %d for %s", dst, src_key); \
|
||||
exit(-1); \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// read a vector of integers
|
||||
#define SHERPA_ONNX_READ_META_DATA_VEC(dst, src_key) \
|
||||
do { \
|
||||
@@ -112,4 +127,20 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Read a string from the model's custom metadata, falling back to a default.
//
// Looks up `src_key` via meta_data.LookupCustomMetadataMapAllocated(). If the
// key is absent, `dst` is set to `default_value`; otherwise `dst` is assigned
// the stored string. A key that is present but empty is logged as invalid and
// the process exits with -1 (the default itself may be empty).
//
// The expansion refers to variables named `meta_data` and `allocator`, so
// both must be visible at the point of use.
#define SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(dst, src_key, \
|
||||
default_value) \
|
||||
do { \
|
||||
auto value = \
|
||||
meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \
|
||||
if (!value) { \
|
||||
dst = default_value; \
|
||||
} else { \
|
||||
dst = value.get(); \
|
||||
if (dst.empty()) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value for %s\n", src_key); \
|
||||
exit(-1); \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_MACROS_H_
|
||||
|
||||
35
sherpa-onnx/csrc/offline-tts-frontend.h
Normal file
35
sherpa-onnx/csrc/offline-tts-frontend.h
Normal file
@@ -0,0 +1,35 @@
|
||||
// sherpa-onnx/csrc/offline-tts-frontend.h
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Abstract interface for a TTS text frontend, i.e., the component that turns
// input text into the token IDs consumed by the acoustic model.
class OfflineTtsFrontend {
|
||||
public:
|
||||
// Virtual so that implementations can be destroyed through a base pointer.
virtual ~OfflineTtsFrontend() = default;
|
||||
|
||||
/** Convert a string to token IDs.
 *
 * @param text The input text. It may contain more than one sentence.
 *             Example 1: "This is the first sample sentence; this is the
 *             second one."
 *             Example 2: "这是第一句。这是第二句。"
 * @param voice Optional. It is for espeak-ng.
 *
 * @return Return a vector-of-vector of token IDs. Each subvector contains
 *         a sentence that can be processed independently.
 *         If a frontend does not support splitting the text into sentences,
 *         the resulting vector contains only one subvector.
 */
|
||||
virtual std::vector<std::vector<int64_t>> ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string &voice = "") const = 0;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
||||
@@ -18,9 +18,11 @@
|
||||
#include "kaldifst/csrc/text-normalizer.h"
|
||||
#include "sherpa-onnx/csrc/lexicon.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-impl.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
@@ -29,10 +31,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
public:
|
||||
explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config)
|
||||
: config_(config),
|
||||
model_(std::make_unique<OfflineTtsVitsModel>(config.model)),
|
||||
lexicon_(config.model.vits.lexicon, config.model.vits.tokens,
|
||||
model_->Punctuations(), model_->Language(), config.model.debug,
|
||||
model_->IsPiper()) {
|
||||
model_(std::make_unique<OfflineTtsVitsModel>(config.model)) {
|
||||
InitFrontend();
|
||||
|
||||
if (!config.rule_fsts.empty()) {
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(config.rule_fsts, ",", false, &files);
|
||||
@@ -49,10 +50,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
#if __ANDROID_API__ >= 9
|
||||
OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config)
|
||||
: config_(config),
|
||||
model_(std::make_unique<OfflineTtsVitsModel>(mgr, config.model)),
|
||||
lexicon_(mgr, config.model.vits.lexicon, config.model.vits.tokens,
|
||||
model_->Punctuations(), model_->Language(), config.model.debug,
|
||||
model_->IsPiper()) {
|
||||
model_(std::make_unique<OfflineTtsVitsModel>(mgr, config.model)) {
|
||||
InitFrontend(mgr);
|
||||
|
||||
if (!config.rule_fsts.empty()) {
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(config.rule_fsts, ",", false, &files);
|
||||
@@ -101,20 +101,119 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<int64_t> x = lexicon_.ConvertTextToTokenIds(text);
|
||||
if (x.empty()) {
|
||||
std::vector<std::vector<int64_t>> x =
|
||||
frontend_->ConvertTextToTokenIds(text, model_->Voice());
|
||||
|
||||
if (x.empty() || (x.size() == 1 && x[0].empty())) {
|
||||
SHERPA_ONNX_LOGE("Failed to convert %s to token IDs", text.c_str());
|
||||
return {};
|
||||
}
|
||||
|
||||
if (model_->AddBlank()) {
|
||||
std::vector<int64_t> buffer(x.size() * 2 + 1);
|
||||
int32_t i = 1;
|
||||
for (auto k : x) {
|
||||
buffer[i] = k;
|
||||
i += 2;
|
||||
if (model_->AddBlank() && config_.model.vits.data_dir.empty()) {
|
||||
for (auto &k : x) {
|
||||
k = AddBlank(k);
|
||||
}
|
||||
x = std::move(buffer);
|
||||
}
|
||||
|
||||
int32_t x_size = static_cast<int32_t>(x.size());
|
||||
|
||||
if (config_.max_num_sentences <= 0 || x_size <= config_.max_num_sentences) {
|
||||
return Process(x, sid, speed);
|
||||
}
|
||||
|
||||
// the input text is too long, we process sentences within it in batches
|
||||
// to avoid OOM. Batch size is config_.max_num_sentences
|
||||
std::vector<std::vector<int64_t>> batch;
|
||||
int32_t batch_size = config_.max_num_sentences;
|
||||
batch.reserve(config_.max_num_sentences);
|
||||
int32_t num_batches = x_size / batch_size;
|
||||
|
||||
if (config_.model.debug) {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Text is too long. Split it into %d batches. batch size: %d. Number "
|
||||
"of sentences: %d",
|
||||
num_batches, batch_size, x_size);
|
||||
}
|
||||
|
||||
GeneratedAudio ans;
|
||||
|
||||
int32_t k = 0;
|
||||
|
||||
for (int32_t b = 0; b != num_batches; ++b) {
|
||||
batch.clear();
|
||||
for (int32_t i = 0; i != batch_size; ++i, ++k) {
|
||||
batch.push_back(std::move(x[k]));
|
||||
}
|
||||
|
||||
auto audio = Process(batch, sid, speed);
|
||||
ans.sample_rate = audio.sample_rate;
|
||||
ans.samples.insert(ans.samples.end(), audio.samples.begin(),
|
||||
audio.samples.end());
|
||||
}
|
||||
|
||||
batch.clear();
|
||||
while (k < x.size()) {
|
||||
batch.push_back(std::move(x[k]));
|
||||
++k;
|
||||
}
|
||||
|
||||
if (!batch.empty()) {
|
||||
auto audio = Process(batch, sid, speed);
|
||||
ans.sample_rate = audio.sample_rate;
|
||||
ans.samples.insert(ans.samples.end(), audio.samples.begin(),
|
||||
audio.samples.end());
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
private:
|
||||
#if __ANDROID_API__ >= 9
// Create frontend_ from files that are read through the Android asset manager.
//
// AAssetManager and the asset-manager constructors of the frontends are only
// declared when __ANDROID_API__ >= 9, so this overload must sit behind the
// same guard; otherwise non-Android builds fail to compile.
//
// @param mgr Asset manager forwarded to the frontend constructor.
void InitFrontend(AAssetManager *mgr) {
  if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) {
    // A piper model with an espeak-ng data directory: phonemize with
    // piper-phonemize; the lexicon is not used in this case.
    frontend_ = std::make_unique<PiperPhonemizeLexicon>(
        mgr, config_.model.vits.tokens, config_.model.vits.data_dir);
  } else {
    // Otherwise fall back to the lexicon-based frontend.
    frontend_ = std::make_unique<Lexicon>(
        mgr, config_.model.vits.lexicon, config_.model.vits.tokens,
        model_->Punctuations(), model_->Language(), config_.model.debug,
        model_->IsPiper());
  }
}
#endif
|
||||
|
||||
void InitFrontend() {
|
||||
if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) {
|
||||
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
||||
config_.model.vits.tokens, config_.model.vits.data_dir);
|
||||
} else {
|
||||
frontend_ = std::make_unique<Lexicon>(
|
||||
config_.model.vits.lexicon, config_.model.vits.tokens,
|
||||
model_->Punctuations(), model_->Language(), config_.model.debug,
|
||||
model_->IsPiper());
|
||||
}
|
||||
}
|
||||
|
||||
// Interleave the token IDs in x with blanks.
//
// The result has 2 * x.size() + 1 entries: x[i] is stored at index 2 * i + 1
// and every even index holds the blank ID, which is assumed to be 0.
//
// @param x Token IDs of one sentence.
// @return The token IDs with a blank before, between, and after them.
std::vector<int64_t> AddBlank(const std::vector<int64_t> &x) const {
  // Value-initialization fills the vector with 0, i.e., with blanks.
  std::vector<int64_t> interleaved(2 * x.size() + 1);

  for (std::size_t pos = 0; pos != x.size(); ++pos) {
    interleaved[2 * pos + 1] = x[pos];
  }

  return interleaved;
}
|
||||
|
||||
GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
|
||||
int32_t sid, float speed) const {
|
||||
int32_t num_tokens = 0;
|
||||
for (const auto &k : tokens) {
|
||||
num_tokens += k.size();
|
||||
}
|
||||
|
||||
std::vector<int64_t> x;
|
||||
x.reserve(num_tokens);
|
||||
for (const auto &k : tokens) {
|
||||
x.insert(x.end(), k.begin(), k.end());
|
||||
}
|
||||
|
||||
auto memory_info =
|
||||
@@ -147,7 +246,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
|
||||
OfflineTtsConfig config_;
|
||||
std::unique_ptr<OfflineTtsVitsModel> model_;
|
||||
std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_;
|
||||
Lexicon lexicon_;
|
||||
std::unique_ptr<OfflineTtsFrontend> frontend_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -13,6 +13,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) {
|
||||
po->Register("vits-model", &model, "Path to VITS model");
|
||||
po->Register("vits-lexicon", &lexicon, "Path to lexicon.txt for VITS models");
|
||||
po->Register("vits-tokens", &tokens, "Path to tokens.txt for VITS models");
|
||||
po->Register("vits-data-dir", &data_dir,
|
||||
"Path to the directory containing dict for espeak-ng. If it is "
|
||||
"given, --vits-lexicon is ignored.");
|
||||
po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models");
|
||||
po->Register("vits-noise-scale-w", &noise_scale_w,
|
||||
"noise_scale_w for VITS models");
|
||||
@@ -31,16 +34,6 @@ bool OfflineTtsVitsModelConfig::Validate() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (lexicon.empty()) {
|
||||
SHERPA_ONNX_LOGE("Please provide --vits-lexicon");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(lexicon)) {
|
||||
SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tokens.empty()) {
|
||||
SHERPA_ONNX_LOGE("Please provide --vits-tokens");
|
||||
return false;
|
||||
@@ -51,6 +44,43 @@ bool OfflineTtsVitsModelConfig::Validate() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (data_dir.empty()) {
|
||||
if (lexicon.empty()) {
|
||||
SHERPA_ONNX_LOGE("Please provide --vits-lexicon");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(lexicon)) {
|
||||
SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
} else {
|
||||
if (!FileExists(data_dir + "/phontab")) {
|
||||
SHERPA_ONNX_LOGE("%s/phontab does not exist. Skipping test",
|
||||
data_dir.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(data_dir + "/phonindex")) {
|
||||
SHERPA_ONNX_LOGE("%s/phonindex does not exist. Skipping test",
|
||||
data_dir.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(data_dir + "/phondata")) {
|
||||
SHERPA_ONNX_LOGE("%s/phondata does not exist. Skipping test",
|
||||
data_dir.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(data_dir + "/intonations")) {
|
||||
SHERPA_ONNX_LOGE("%s/intonations does not exist. Skipping test",
|
||||
data_dir.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -61,6 +91,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const {
|
||||
os << "model=\"" << model << "\", ";
|
||||
os << "lexicon=\"" << lexicon << "\", ";
|
||||
os << "tokens=\"" << tokens << "\", ";
|
||||
os << "data_dir=\"" << data_dir << "\", ";
|
||||
os << "noise_scale=" << noise_scale << ", ";
|
||||
os << "noise_scale_w=" << noise_scale_w << ", ";
|
||||
os << "length_scale=" << length_scale << ")";
|
||||
|
||||
@@ -16,6 +16,10 @@ struct OfflineTtsVitsModelConfig {
|
||||
std::string lexicon;
|
||||
std::string tokens;
|
||||
|
||||
// If data_dir is given, lexicon is ignored
|
||||
// data_dir is for piper-phonemize, which uses espeak-ng
|
||||
std::string data_dir;
|
||||
|
||||
float noise_scale = 0.667;
|
||||
float noise_scale_w = 0.8;
|
||||
float length_scale = 1;
|
||||
@@ -28,11 +32,13 @@ struct OfflineTtsVitsModelConfig {
|
||||
OfflineTtsVitsModelConfig(const std::string &model,
|
||||
const std::string &lexicon,
|
||||
const std::string &tokens,
|
||||
const std::string &data_dir,
|
||||
float noise_scale = 0.667,
|
||||
float noise_scale_w = 0.8, float length_scale = 1)
|
||||
: model(model),
|
||||
lexicon(lexicon),
|
||||
tokens(tokens),
|
||||
data_dir(data_dir),
|
||||
noise_scale(noise_scale),
|
||||
noise_scale_w(noise_scale_w),
|
||||
length_scale(length_scale) {}
|
||||
|
||||
@@ -51,6 +51,7 @@ class OfflineTtsVitsModel::Impl {
|
||||
|
||||
std::string Punctuations() const { return punctuations_; }
|
||||
std::string Language() const { return language_; }
|
||||
std::string Voice() const { return voice_; }
|
||||
bool IsPiper() const { return is_piper_; }
|
||||
int32_t NumSpeakers() const { return num_speakers_; }
|
||||
|
||||
@@ -74,10 +75,12 @@ class OfflineTtsVitsModel::Impl {
|
||||
|
||||
Ort::AllocatorWithDefaultOptions allocator; // used in the macro below
|
||||
SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate");
|
||||
SHERPA_ONNX_READ_META_DATA(add_blank_, "add_blank");
|
||||
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(add_blank_, "add_blank", 0);
|
||||
SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers");
|
||||
SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation");
|
||||
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(punctuations_, "punctuation",
|
||||
"");
|
||||
SHERPA_ONNX_READ_META_DATA_STR(language_, "language");
|
||||
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(voice_, "voice", "");
|
||||
|
||||
std::string comment;
|
||||
SHERPA_ONNX_READ_META_DATA_STR(comment, "comment");
|
||||
@@ -215,6 +218,7 @@ class OfflineTtsVitsModel::Impl {
|
||||
int32_t num_speakers_;
|
||||
std::string punctuations_;
|
||||
std::string language_;
|
||||
std::string voice_;
|
||||
|
||||
bool is_piper_ = false;
|
||||
};
|
||||
@@ -244,6 +248,7 @@ std::string OfflineTtsVitsModel::Punctuations() const {
|
||||
}
|
||||
|
||||
std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); }
|
||||
std::string OfflineTtsVitsModel::Voice() const { return impl_->Voice(); }
|
||||
|
||||
bool OfflineTtsVitsModel::IsPiper() const { return impl_->IsPiper(); }
|
||||
|
||||
|
||||
@@ -46,7 +46,8 @@ class OfflineTtsVitsModel {
|
||||
bool AddBlank() const;
|
||||
|
||||
std::string Punctuations() const;
|
||||
std::string Language() const;
|
||||
std::string Language() const; // e.g., Chinese, English, German, etc.
|
||||
std::string Voice() const; // e.g., en-us, for espeak-ng
|
||||
bool IsPiper() const;
|
||||
int32_t NumSpeakers() const;
|
||||
|
||||
|
||||
@@ -21,6 +21,12 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
|
||||
"Multiple filenames are separated by a comma and they are "
|
||||
"applied from left to right. An example value: "
|
||||
"rule1.fst,rule2,fst,rule3.fst");
|
||||
|
||||
po->Register(
|
||||
"tts-max-num-sentences", &max_num_sentences,
|
||||
"Maximum number of sentences that we process at a time. "
|
||||
"This is to avoid OOM for very long input text. "
|
||||
"If you set it to -1, then we process all sentences in a single batch.");
|
||||
}
|
||||
|
||||
bool OfflineTtsConfig::Validate() const {
|
||||
@@ -43,7 +49,8 @@ std::string OfflineTtsConfig::ToString() const {
|
||||
|
||||
os << "OfflineTtsConfig(";
|
||||
os << "model=" << model.ToString() << ", ";
|
||||
os << "rule_fsts=\"" << rule_fsts << "\")";
|
||||
os << "rule_fsts=\"" << rule_fsts << "\", ";
|
||||
os << "max_num_sentences=" << max_num_sentences << ")";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
@@ -28,10 +28,17 @@ struct OfflineTtsConfig {
|
||||
// If there are multiple rules, they are applied from left to right.
|
||||
std::string rule_fsts;
|
||||
|
||||
// Maximum number of sentences that we process at a time.
|
||||
// This is to avoid OOM for very long input text.
|
||||
// If you set it to -1, then we process all sentences in a single batch.
|
||||
int32_t max_num_sentences = 2;
|
||||
|
||||
OfflineTtsConfig() = default;
|
||||
OfflineTtsConfig(const OfflineTtsModelConfig &model,
|
||||
const std::string &rule_fsts)
|
||||
: model(model), rule_fsts(rule_fsts) {}
|
||||
const std::string &rule_fsts, int32_t max_num_sentences)
|
||||
: model(model),
|
||||
rule_fsts(rule_fsts),
|
||||
max_num_sentences(max_num_sentences) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
170
sherpa-onnx/csrc/piper-phonemize-lexicon.cc
Normal file
170
sherpa-onnx/csrc/piper-phonemize-lexicon.cc
Normal file
@@ -0,0 +1,170 @@
|
||||
// sherpa-onnx/csrc/piper-phonemize-lexicon.cc
|
||||
//
|
||||
// Copyright (c) 2022-2023 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
|
||||
|
||||
#include <codecvt>
|
||||
#include <fstream>
|
||||
#include <locale>
|
||||
#include <map>
|
||||
#include <mutex> // NOLINT
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
#include <strstream>
|
||||
|
||||
#include "android/asset_manager.h"
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#include "espeak-ng/speak_lib.h"
|
||||
#include "phoneme_ids.hpp"
|
||||
#include "phonemize.hpp"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) {
|
||||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
|
||||
std::unordered_map<char32_t, int32_t> token2id;
|
||||
|
||||
std::string line;
|
||||
|
||||
std::string sym;
|
||||
std::u32string s;
|
||||
int32_t id;
|
||||
while (std::getline(is, line)) {
|
||||
std::istringstream iss(line);
|
||||
iss >> sym;
|
||||
if (iss.eof()) {
|
||||
id = atoi(sym.c_str());
|
||||
sym = " ";
|
||||
} else {
|
||||
iss >> id;
|
||||
}
|
||||
|
||||
// eat the trailing \r\n on windows
|
||||
iss >> std::ws;
|
||||
if (!iss.eof()) {
|
||||
SHERPA_ONNX_LOGE("Error when reading tokens: %s", line.c_str());
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
s = conv.from_bytes(sym);
|
||||
if (s.size() != 1) {
|
||||
SHERPA_ONNX_LOGE("Error when reading tokens at Line %s. size: %d",
|
||||
line.c_str(), static_cast<int32_t>(s.size()));
|
||||
exit(-1);
|
||||
}
|
||||
char32_t c = s[0];
|
||||
|
||||
if (token2id.count(c)) {
|
||||
SHERPA_ONNX_LOGE("Duplicated token %s. Line %s. Existing ID: %d",
|
||||
sym.c_str(), line.c_str(), token2id.at(c));
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
token2id.insert({c, id});
|
||||
}
|
||||
|
||||
return token2id;
|
||||
}
|
||||
|
||||
// see the function "phonemes_to_ids" from
|
||||
// https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb
|
||||
static std::vector<int64_t> PhonemesToIds(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
const std::vector<piper::Phoneme> &phonemes) {
|
||||
// see
|
||||
// https://github.com/rhasspy/piper-phonemize/blob/master/src/phoneme_ids.hpp#L17
|
||||
int32_t pad = token2id.at(U'_');
|
||||
int32_t bos = token2id.at(U'^');
|
||||
int32_t eos = token2id.at(U'$');
|
||||
|
||||
std::vector<int64_t> ans;
|
||||
ans.reserve(phonemes.size());
|
||||
|
||||
ans.push_back(bos);
|
||||
for (auto p : phonemes) {
|
||||
if (token2id.count(p)) {
|
||||
ans.push_back(token2id.at(p));
|
||||
ans.push_back(pad);
|
||||
} else {
|
||||
SHERPA_ONNX_LOGE("Skip unkown phonemes. Unicode codepoint: \\U+%04x.", p);
|
||||
}
|
||||
}
|
||||
ans.push_back(eos);
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
void InitEspeak(const std::string &data_dir) {
|
||||
static std::once_flag init_flag;
|
||||
std::call_once(init_flag, [data_dir]() {
|
||||
int32_t result =
|
||||
espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, data_dir.c_str(), 0);
|
||||
if (result != 22050) {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Failed to initialize espeak-ng with data dir: %s. Return code is: "
|
||||
"%d",
|
||||
data_dir.c_str(), result);
|
||||
exit(-1);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Construct the frontend from files on the local filesystem.
//
// If the tokens file cannot be opened, an error is logged and the process
// exits with -1, instead of continuing with an empty token table that would
// fail later with a much less helpful error.
//
// @param tokens Path to tokens.txt.
// @param data_dir Path to the espeak-ng data directory.
PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &tokens,
                                             const std::string &data_dir)
    : data_dir_(data_dir) {
  {
    std::ifstream is(tokens);
    if (!is) {
      SHERPA_ONNX_LOGE("Failed to open tokens file: %s", tokens.c_str());
      exit(-1);
    }

    token2id_ = ReadTokens(is);
  }

  InitEspeak(data_dir_);
}
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
// Construct the frontend on Android.
//
// tokens is read through the asset manager. data_dir, however, must be a
// directory on the filesystem: the espeak-ng-data directory should be copied
// from the assets to some internal or external storage first, and the
// resulting path passed as data_dir.
//
// @param mgr Asset manager used to read tokens.
// @param tokens Name of the tokens.txt asset.
// @param data_dir Filesystem path to the espeak-ng data directory.
PiperPhonemizeLexicon::PiperPhonemizeLexicon(AAssetManager *mgr,
                                             const std::string &tokens,
                                             const std::string &data_dir)
    // data_dir_ must be set before InitEspeak() below reads it; without this
    // initializer espeak-ng would be initialized with an empty path.
    : data_dir_(data_dir) {
  {
    auto buf = ReadFile(mgr, tokens);
    std::istrstream is(buf.data(), buf.size());
    token2id_ = ReadTokens(is);
  }

  InitEspeak(data_dir_);
}
|
||||
#endif
|
||||
|
||||
std::vector<std::vector<int64_t>> PiperPhonemizeLexicon::ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
// ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
|
||||
// to list available voices
|
||||
config.voice = voice; // e.g., voice is en-us
|
||||
|
||||
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||||
piper::phonemize_eSpeak(text, config, phonemes);
|
||||
|
||||
std::vector<std::vector<int64_t>> ans;
|
||||
|
||||
std::vector<int64_t> phoneme_ids;
|
||||
for (const auto &p : phonemes) {
|
||||
phoneme_ids = PhonemesToIds(token2id_, p);
|
||||
ans.push_back(std::move(phoneme_ids));
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
41
sherpa-onnx/csrc/piper-phonemize-lexicon.h
Normal file
41
sherpa-onnx/csrc/piper-phonemize-lexicon.h
Normal file
@@ -0,0 +1,41 @@
|
||||
// sherpa-onnx/csrc/piper-phonemize-lexicon.h
|
||||
//
|
||||
// Copyright (c) 2022-2023 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_
|
||||
#define SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
#include "android/asset_manager.h"
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// A TTS frontend that converts text to token IDs with piper-phonemize, which
// in turn uses espeak-ng.
class PiperPhonemizeLexicon : public OfflineTtsFrontend {
|
||||
public:
|
||||
// @param tokens Path to tokens.txt.
// @param data_dir Path to the espeak-ng data directory.
PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir);
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
// Same as above, but tokens is read through the Android asset manager.
PiperPhonemizeLexicon(AAssetManager *mgr, const std::string &tokens,
|
||||
const std::string &data_dir);
|
||||
#endif
|
||||
|
||||
// Convert text to token IDs.
// @param text The input text.
// @param voice The espeak-ng voice, e.g., en-us.
// @return A vector-of-vector of token IDs.
std::vector<std::vector<int64_t>> ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string &voice = "") const override;
|
||||
|
||||
private:
|
||||
// Directory passed to espeak-ng on initialization.
std::string data_dir_;
|
||||
// map unicode codepoint to an integer ID
|
||||
std::unordered_map<char32_t, int32_t> token2id_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_
|
||||
@@ -48,7 +48,7 @@ TEST(PiperPhonemize, Case1) {
|
||||
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
// ./bin/espeak-ng --path ./install/share/espeak-ng-data/ --voices
|
||||
// ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
|
||||
// to list available voices
|
||||
config.voice = "en-us";
|
||||
|
||||
@@ -61,15 +61,15 @@ TEST(PiperPhonemize, Case1) {
|
||||
}
|
||||
std::cout << "\n";
|
||||
|
||||
std::vector<piper::PhonemeId> phonemeIds;
|
||||
std::map<piper::Phoneme, std::size_t> missingPhonemes;
|
||||
std::vector<piper::PhonemeId> phoneme_ids;
|
||||
std::map<piper::Phoneme, std::size_t> missing_phonemes;
|
||||
|
||||
{
|
||||
piper::PhonemeIdConfig config;
|
||||
phonemes_to_ids(phonemes[0], config, phonemeIds, missingPhonemes);
|
||||
phonemes_to_ids(phonemes[0], config, phoneme_ids, missing_phonemes);
|
||||
}
|
||||
|
||||
for (int32_t p : phonemeIds) {
|
||||
for (int32_t p : phoneme_ids) {
|
||||
std::cout << p << " ";
|
||||
}
|
||||
std::cout << "\n";
|
||||
|
||||
@@ -545,6 +545,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
|
||||
ans.model.vits.tokens = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
fid = env->GetFieldID(vits_cls, "dataDir", "Ljava/lang/String;");
|
||||
s = (jstring)env->GetObjectField(vits, fid);
|
||||
p = env->GetStringUTFChars(s, nullptr);
|
||||
ans.model.vits.data_dir = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
fid = env->GetFieldID(vits_cls, "noiseScale", "F");
|
||||
ans.model.vits.noise_scale = env->GetFloatField(vits, fid);
|
||||
|
||||
@@ -573,6 +579,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
|
||||
ans.rule_fsts = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
fid = env->GetFieldID(cls, "maxNumSentences", "I");
|
||||
ans.max_num_sentences = env->GetIntField(config, fid);
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
@@ -589,6 +598,11 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new(
|
||||
#endif
|
||||
auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config);
|
||||
SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str());
|
||||
|
||||
if (!config.Validate()) {
|
||||
SHERPA_ONNX_LOGE("Erros found in config!");
|
||||
}
|
||||
|
||||
auto tts = new sherpa_onnx::SherpaOnnxOfflineTts(
|
||||
#if __ANDROID_API__ >= 9
|
||||
mgr,
|
||||
|
||||
@@ -16,17 +16,20 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) {
|
||||
py::class_<PyClass>(*m, "OfflineTtsVitsModelConfig")
|
||||
.def(py::init<>())
|
||||
.def(py::init<const std::string &, const std::string &,
|
||||
const std::string &, float, float, float>(),
|
||||
const std::string &, const std::string, float, float,
|
||||
float>(),
|
||||
py::arg("model"), py::arg("lexicon"), py::arg("tokens"),
|
||||
py::arg("noise_scale") = 0.667, py::arg("noise_scale_w") = 0.8,
|
||||
py::arg("length_scale") = 1.0)
|
||||
py::arg("data_dir") = "", py::arg("noise_scale") = 0.667,
|
||||
py::arg("noise_scale_w") = 0.8, py::arg("length_scale") = 1.0)
|
||||
.def_readwrite("model", &PyClass::model)
|
||||
.def_readwrite("lexicon", &PyClass::lexicon)
|
||||
.def_readwrite("tokens", &PyClass::tokens)
|
||||
.def_readwrite("data_dir", &PyClass::data_dir)
|
||||
.def_readwrite("noise_scale", &PyClass::noise_scale)
|
||||
.def_readwrite("noise_scale_w", &PyClass::noise_scale_w)
|
||||
.def_readwrite("length_scale", &PyClass::length_scale)
|
||||
.def("__str__", &PyClass::ToString);
|
||||
.def("__str__", &PyClass::ToString)
|
||||
.def("validate", &PyClass::Validate);
|
||||
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -30,10 +30,14 @@ static void PybindOfflineTtsConfig(py::module *m) {
|
||||
using PyClass = OfflineTtsConfig;
|
||||
py::class_<PyClass>(*m, "OfflineTtsConfig")
|
||||
.def(py::init<>())
|
||||
.def(py::init<const OfflineTtsModelConfig &, const std::string &>(),
|
||||
py::arg("model"), py::arg("rule_fsts") = "")
|
||||
.def(py::init<const OfflineTtsModelConfig &, const std::string &,
|
||||
int32_t>(),
|
||||
py::arg("model"), py::arg("rule_fsts") = "",
|
||||
py::arg("max_num_sentences") = 2)
|
||||
.def_readwrite("model", &PyClass::model)
|
||||
.def_readwrite("rule_fsts", &PyClass::rule_fsts)
|
||||
.def_readwrite("max_num_sentences", &PyClass::max_num_sentences)
|
||||
.def("validate", &PyClass::Validate)
|
||||
.def("__str__", &PyClass::ToString);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user