update README

This commit is contained in:
2025-09-10 10:47:02 +08:00
parent 5088f0b50a
commit ff78032400
603 changed files with 21 additions and 23 deletions

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
import argparse
import logging
import json
import time
import statistics
import sys
import torch
# Module-level logger; synthesize() emits one DEBUG record per utterance on it.
_LOGGER = logging.getLogger(__name__)
def main() -> None:
    """Benchmark a PyTorch generator checkpoint on utterances read from stdin.

    Each non-blank stdin line must be a JSON object with a "phoneme_ids" list
    and an optional "speaker_id". A JSON report is written to stdout with the
    model load time, the per-utterance real-time factors, and their mean and
    sample standard deviation.

    "rtf_mean" is null when no utterances were given, and "rtf_stdev" is null
    when fewer than two were given, because the statistics are undefined there.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to generator file (.pt)"
    )
    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    # Default config path is the model path with ".json" appended.
    if not args.config:
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Skip blank lines (e.g. a trailing newline) instead of failing to parse them.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    start_time = time.monotonic_ns()
    model = torch.load(args.model)
    end_time = time.monotonic_ns()
    model.eval()
    load_sec = (end_time - start_time) / 1e9

    synthesize_rtf = [
        synthesize(
            model,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    # statistics.mean needs at least one value and statistics.stdev at least
    # two; report null rather than crashing after all the work is done.
    rtf_mean = statistics.mean(synthesize_rtf) if synthesize_rtf else None
    rtf_stdev = statistics.stdev(synthesize_rtf) if len(synthesize_rtf) > 1 else None

    json.dump(
        {
            "load_sec": load_sec,
            "rtf_mean": rtf_mean,
            "rtf_stdev": rtf_stdev,
            "synthesize_rtf": synthesize_rtf,
        },
        sys.stdout,
    )
def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one utterance through the model and return its real-time factor.

    Args:
        model: Callable invoked as model(text, text_lengths, sid); element 0 of
            its result is taken as the audio tensor.
        phoneme_ids: Sequence of integer phoneme ids for the utterance.
        speaker_id: Integer speaker id, or None to pass no speaker tensor.
        sample_rate: Audio sample rate used to convert samples to seconds.

    Returns:
        Inference seconds divided by seconds of audio produced.
    """
    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
    text_lengths = torch.LongTensor([len(phoneme_ids)])
    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None

    start_time = time.monotonic_ns()
    # Gradients are never used here; disabling autograd keeps graph
    # bookkeeping out of the measured inference time.
    with torch.no_grad():
        audio = (
            model(
                text,
                text_lengths,
                sid,
            )[0]
            .detach()
            .numpy()
            .squeeze()
        )
    end_time = time.monotonic_ns()

    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9
    rtf = infer_sec / audio_sec

    _LOGGER.debug(
        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
        rtf,
        infer_sec,
        audio_sec,
    )
    return rtf
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python3
import argparse
import logging
import json
import time
import statistics
import sys
import onnxruntime
import numpy as np
# Fixed synthesis settings; synthesize() packs them, in this order, into the
# model's "scales" input.
_NOISE_SCALE = 0.667
_LENGTH_SCALE = 1.0
_NOISE_W = 0.8
# Module-level logger; synthesize() emits one DEBUG record per utterance on it.
_LOGGER = logging.getLogger(__name__)
def main() -> None:
    """Benchmark an ONNX model on utterances read from stdin.

    Each non-blank stdin line must be a JSON object with a "phoneme_ids" list
    and an optional "speaker_id". A JSON report is written to stdout with the
    session load time, the per-utterance real-time factors ("rtfs"), and their
    mean and sample standard deviation.

    "rtf_mean" is null when no utterances were given, and "rtf_stdev" is null
    when fewer than two were given, because the statistics are undefined there.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to Onnx model file (.onnx)"
    )
    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    # Default config path is the model path with ".json" appended.
    if not args.config:
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Skip blank lines (e.g. a trailing newline) instead of failing to parse them.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # Session option setup is deliberately included in the measured load time.
    start_time = time.monotonic_ns()
    session_options = onnxruntime.SessionOptions()
    session_options.graph_optimization_level = (
        onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
    )
    # session_options.enable_cpu_mem_arena = False
    # session_options.enable_mem_pattern = False
    session_options.enable_mem_reuse = False
    # session_options.enable_profiling = False
    # session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
    # session_options.execution_order = onnxruntime.ExecutionOrder.PRIORITY_BASED
    session = onnxruntime.InferenceSession(
        args.model,
        sess_options=session_options,
    )
    # session.intra_op_num_threads = 1
    # session.inter_op_num_threads = 1
    end_time = time.monotonic_ns()
    load_sec = (end_time - start_time) / 1e9

    synthesize_rtf = [
        synthesize(
            session,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    # statistics.mean needs at least one value and statistics.stdev at least
    # two; report null rather than crashing after all the work is done.
    rtf_mean = statistics.mean(synthesize_rtf) if synthesize_rtf else None
    rtf_stdev = statistics.stdev(synthesize_rtf) if len(synthesize_rtf) > 1 else None

    json.dump(
        {
            "load_sec": load_sec,
            "rtf_mean": rtf_mean,
            "rtf_stdev": rtf_stdev,
            "rtfs": synthesize_rtf,
        },
        sys.stdout,
    )
def synthesize(session, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one utterance through the ONNX session and return its real-time factor.

    Args:
        session: Object whose run(None, feeds) returns a sequence; element 0 is
            taken as the audio array.
        phoneme_ids: Sequence of integer phoneme ids for the utterance.
        speaker_id: Integer speaker id, or None to feed no "sid" input.
        sample_rate: Audio sample rate used to convert samples to seconds.

    Returns:
        Inference seconds divided by seconds of audio produced.
    """
    phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
    scales = np.array(
        [_NOISE_SCALE, _LENGTH_SCALE, _NOISE_W],
        dtype=np.float32,
    )

    inputs = {
        "input": phoneme_ids_array,
        "input_lengths": phoneme_ids_lengths,
        "scales": scales,
    }
    # Only feed "sid" when a speaker was requested: a None value is not a
    # valid feed, and models without a speaker input reject the extra name.
    if speaker_id is not None:
        inputs["sid"] = np.array([speaker_id], dtype=np.int64)

    # Synthesize through Onnx
    start_time = time.monotonic_ns()
    audio = session.run(None, inputs)[0].squeeze()
    end_time = time.monotonic_ns()

    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9
    rtf = infer_sec / audio_sec

    _LOGGER.debug(
        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
        rtf,
        infer_sec,
        audio_sec,
    )
    return rtf
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
import argparse
import logging
import json
import time
import statistics
import sys
import torch
# Fixed synthesis settings; synthesize() passes each one to the model as a
# one-element float tensor.
_NOISE_SCALE = 0.667
_LENGTH_SCALE = 1.0
_NOISE_W = 0.8
# Module-level logger; synthesize() emits one DEBUG record per utterance on it.
_LOGGER = logging.getLogger(__name__)
def main() -> None:
    """Benchmark a TorchScript model on utterances read from stdin.

    Each non-blank stdin line must be a JSON object with a "phoneme_ids" list
    and an optional "speaker_id". A JSON report is written to stdout with the
    model load time, the per-utterance real-time factors, and their mean and
    sample standard deviation.

    "rtf_mean" is null when no utterances were given, and "rtf_stdev" is null
    when fewer than two were given, because the statistics are undefined there.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to Torchscript file (.ts)"
    )
    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    # Default config path is the model path with ".json" appended.
    if not args.config:
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Skip blank lines (e.g. a trailing newline) instead of failing to parse them.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    start_time = time.monotonic_ns()
    model = torch.jit.load(args.model)
    end_time = time.monotonic_ns()
    model.eval()
    load_sec = (end_time - start_time) / 1e9

    synthesize_rtf = [
        synthesize(
            model,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    # statistics.mean needs at least one value and statistics.stdev at least
    # two; report null rather than crashing after all the work is done.
    rtf_mean = statistics.mean(synthesize_rtf) if synthesize_rtf else None
    rtf_stdev = statistics.stdev(synthesize_rtf) if len(synthesize_rtf) > 1 else None

    json.dump(
        {
            "load_sec": load_sec,
            "rtf_mean": rtf_mean,
            "rtf_stdev": rtf_stdev,
            "synthesize_rtf": synthesize_rtf,
        },
        sys.stdout,
    )
def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one utterance through the model and return its real-time factor.

    Args:
        model: Callable invoked as model(text, text_lengths, sid, noise_scale,
            length_scale, noise_w); element 0 of its result is taken as the
            audio tensor.
        phoneme_ids: Sequence of integer phoneme ids for the utterance.
        speaker_id: Integer speaker id, or None to pass no speaker tensor.
        sample_rate: Audio sample rate used to convert samples to seconds.

    Returns:
        Inference seconds divided by seconds of audio produced.
    """
    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
    text_lengths = torch.LongTensor([len(phoneme_ids)])
    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None

    # Build the scale tensors before starting the clock so the measurement
    # covers the model call only.
    noise_scale = torch.FloatTensor([_NOISE_SCALE])
    length_scale = torch.FloatTensor([_LENGTH_SCALE])
    noise_w = torch.FloatTensor([_NOISE_W])

    start_time = time.monotonic_ns()
    # Gradients are never used here; disabling autograd keeps graph
    # bookkeeping out of the measured inference time.
    with torch.no_grad():
        audio = (
            model(
                text,
                text_lengths,
                sid,
                noise_scale,
                length_scale,
                noise_w,
            )[0]
            .detach()
            .numpy()
            .squeeze()
        )
    end_time = time.monotonic_ns()

    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9
    rtf = infer_sec / audio_sec

    _LOGGER.debug(
        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
        rtf,
        infer_sec,
        audio_sec,
    )
    return rtf
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,2 @@
onnxruntime~=1.11.0
torch~=1.11.0

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,561 @@
#include <chrono>
#include <condition_variable>
#include <filesystem>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
#ifdef _MSC_VER
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#endif
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif
#ifdef __APPLE__
#include <mach-o/dyld.h>
#endif
#include <spdlog/sinks/stdout_color_sinks.h>
#include <spdlog/spdlog.h>
#include "json.hpp"
#include "piper.hpp"
using namespace std;
using json = nlohmann::json;
// Where synthesized audio is sent.
enum OutputType {
  OUTPUT_FILE,      // one WAV file at a given path
  OUTPUT_DIRECTORY, // automatically named WAV files inside a directory
  OUTPUT_STDOUT,    // WAV data written to stdout
  OUTPUT_RAW        // raw audio samples streamed to stdout
};
struct RunConfig {
// Path to .onnx voice file
filesystem::path modelPath;
// Path to JSON voice config file
filesystem::path modelConfigPath;
// Type of output to produce.
// Default is to write a WAV file in the current directory.
OutputType outputType = OUTPUT_DIRECTORY;
// Path for output
optional<filesystem::path> outputPath = filesystem::path(".");
// Numerical id of the default speaker (multi-speaker voices)
optional<piper::SpeakerId> speakerId;
// Amount of noise to add during audio generation
optional<float> noiseScale;
// Speed of speaking (1 = normal, < 1 is faster, > 1 is slower)
optional<float> lengthScale;
// Variation in phoneme lengths
optional<float> noiseW;
// Seconds of silence to add after each sentence
optional<float> sentenceSilenceSeconds;
// Path to espeak-ng data directory (default is next to piper executable)
optional<filesystem::path> eSpeakDataPath;
// Path to libtashkeel ort model
// https://github.com/mush42/libtashkeel/
optional<filesystem::path> tashkeelModelPath;
// stdin input is lines of JSON instead of text with format:
// {
// "text": str, (required)
// "speaker_id": int, (optional)
// "speaker": str, (optional)
// "output_file": str, (optional)
// }
bool jsonInput = false;
// Seconds of extra silence to insert after a single phoneme
optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
// true to use CUDA execution provider
bool useCuda = false;
};
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
condition_variable &cvAudio, bool &audioReady,
bool &audioFinished);
// ----------------------------------------------------------------------------
// Program entry point: parse arguments, load the voice, then synthesize audio
// for each line read from stdin according to the selected output type.
int main(int argc, char *argv[]) {
// Log to stderr so stdout stays free for audio data and output paths
spdlog::set_default_logger(spdlog::stderr_color_st("piper"));
RunConfig runConfig;
parseArgs(argc, argv, runConfig);
#ifdef _WIN32
// Required on Windows to show IPA symbols
SetConsoleOutputCP(CP_UTF8);
#endif
piper::PiperConfig piperConfig;
piper::Voice voice;
spdlog::debug("Loading voice from {} (config={})",
runConfig.modelPath.string(),
runConfig.modelConfigPath.string());
// Time how long the voice takes to load
auto startTime = chrono::steady_clock::now();
loadVoice(piperConfig, runConfig.modelPath.string(),
runConfig.modelConfigPath.string(), voice, runConfig.speakerId,
runConfig.useCuda);
auto endTime = chrono::steady_clock::now();
spdlog::info("Loaded voice in {} second(s)",
chrono::duration<double>(endTime - startTime).count());
// Get the path to the piper executable so we can locate espeak-ng-data, etc.
// next to it.
#ifdef _MSC_VER
auto exePath = []() {
wchar_t moduleFileName[MAX_PATH] = {0};
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
return filesystem::path(moduleFileName);
}();
#else
#ifdef __APPLE__
auto exePath = []() {
char moduleFileName[PATH_MAX] = {0};
uint32_t moduleFileNameSize = std::size(moduleFileName);
_NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
return filesystem::path(moduleFileName);
}();
#else
auto exePath = filesystem::canonical("/proc/self/exe");
#endif
#endif
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
spdlog::debug("Voice uses eSpeak phonemes ({})",
voice.phonemizeConfig.eSpeak.voice);
if (runConfig.eSpeakDataPath) {
// User provided path
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
} else {
// Assume next to piper executable
piperConfig.eSpeakDataPath =
std::filesystem::absolute(
exePath.parent_path().append("espeak-ng-data"))
.string();
spdlog::debug("espeak-ng-data directory is expected at {}",
piperConfig.eSpeakDataPath);
}
} else {
// Not using eSpeak
piperConfig.useESpeak = false;
}
// Enable libtashkeel for Arabic
if (voice.phonemizeConfig.eSpeak.voice == "ar") {
piperConfig.useTashkeel = true;
if (runConfig.tashkeelModelPath) {
// User provided path
piperConfig.tashkeelModelPath =
runConfig.tashkeelModelPath.value().string();
} else {
// Assume next to piper executable
piperConfig.tashkeelModelPath =
std::filesystem::absolute(
exePath.parent_path().append("libtashkeel_model.ort"))
.string();
spdlog::debug("libtashkeel model is expected at {}",
piperConfig.tashkeelModelPath.value());
}
}
piper::initialize(piperConfig);
// Scales
// Command-line values, when given, replace the ones loaded with the voice
if (runConfig.noiseScale) {
voice.synthesisConfig.noiseScale = runConfig.noiseScale.value();
}
if (runConfig.lengthScale) {
voice.synthesisConfig.lengthScale = runConfig.lengthScale.value();
}
if (runConfig.noiseW) {
voice.synthesisConfig.noiseW = runConfig.noiseW.value();
}
if (runConfig.sentenceSilenceSeconds) {
voice.synthesisConfig.sentenceSilenceSeconds =
runConfig.sentenceSilenceSeconds.value();
}
if (runConfig.phonemeSilenceSeconds) {
if (!voice.synthesisConfig.phonemeSilenceSeconds) {
// Overwrite
voice.synthesisConfig.phonemeSilenceSeconds =
runConfig.phonemeSilenceSeconds;
} else {
// Merge
// NOTE(review): try_emplace keeps an entry that already exists, so a
// phoneme already present in the voice config is NOT replaced by the
// command-line value here -- confirm this precedence is intended.
for (const auto &[phoneme, silenceSeconds] :
*runConfig.phonemeSilenceSeconds) {
voice.synthesisConfig.phonemeSilenceSeconds->try_emplace(
phoneme, silenceSeconds);
}
}
} // if phonemeSilenceSeconds
if (runConfig.outputType == OUTPUT_DIRECTORY) {
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
}
string line;
piper::SynthesisResult result;
// Synthesize one stdin line per iteration
while (getline(cin, line)) {
// Per-line copies; JSON input may override them for this line only
auto outputType = runConfig.outputType;
// Saved so the speaker id can be restored at the end of the iteration
auto speakerId = voice.synthesisConfig.speakerId;
std::optional<filesystem::path> maybeOutputPath = runConfig.outputPath;
if (runConfig.jsonInput) {
// Each line is a JSON object
json lineRoot = json::parse(line);
// Text is required
line = lineRoot["text"].get<std::string>();
if (lineRoot.contains("output_file")) {
// Override output WAV file path
outputType = OUTPUT_FILE;
maybeOutputPath =
filesystem::path(lineRoot["output_file"].get<std::string>());
}
if (lineRoot.contains("speaker_id")) {
// Override speaker id
voice.synthesisConfig.speakerId =
lineRoot["speaker_id"].get<piper::SpeakerId>();
} else if (lineRoot.contains("speaker")) {
// Resolve to id using speaker id map
auto speakerName = lineRoot["speaker"].get<std::string>();
if ((voice.modelConfig.speakerIdMap) &&
(voice.modelConfig.speakerIdMap->count(speakerName) > 0)) {
voice.synthesisConfig.speakerId =
(*voice.modelConfig.speakerIdMap)[speakerName];
} else {
// Unknown name: warn and keep the current speaker id
spdlog::warn("No speaker named: {}", speakerName);
}
}
}
// Timestamp is used for path to output WAV file
const auto now = chrono::system_clock::now();
const auto timestamp =
chrono::duration_cast<chrono::nanoseconds>(now.time_since_epoch())
.count();
if (outputType == OUTPUT_DIRECTORY) {
// Generate path using timestamp
stringstream outputName;
outputName << timestamp << ".wav";
filesystem::path outputPath = runConfig.outputPath.value();
outputPath.append(outputName.str());
// Output audio to automatically-named WAV file in a directory
ofstream audioFile(outputPath.string(), ios::binary);
piper::textToWavFile(piperConfig, voice, line, audioFile, result);
// Report the written file's path on stdout
cout << outputPath.string() << endl;
} else if (outputType == OUTPUT_FILE) {
if (!maybeOutputPath || maybeOutputPath->empty()) {
throw runtime_error("No output path provided");
}
filesystem::path outputPath = maybeOutputPath.value();
if (!runConfig.jsonInput) {
// Read all of standard input before synthesizing.
// Otherwise, we would overwrite the output file for each line.
stringstream text;
text << line;
while (getline(cin, line)) {
text << " " << line;
}
line = text.str();
}
// Output audio to WAV file
ofstream audioFile(outputPath.string(), ios::binary);
piper::textToWavFile(piperConfig, voice, line, audioFile, result);
// Report the written file's path on stdout
cout << outputPath.string() << endl;
} else if (outputType == OUTPUT_STDOUT) {
// Output WAV to stdout
piper::textToWavFile(piperConfig, voice, line, cout, result);
} else if (outputType == OUTPUT_RAW) {
// Raw output to stdout
// State shared with the writer thread; the bools and sharedAudioBuffer
// are only touched while holding mutAudio
mutex mutAudio;
condition_variable cvAudio;
bool audioReady = false;
bool audioFinished = false;
vector<int16_t> audioBuffer;
vector<int16_t> sharedAudioBuffer;
#ifdef _WIN32
// Needed on Windows to avoid terminal conversions
setmode(fileno(stdout), O_BINARY);
setmode(fileno(stdin), O_BINARY);
#endif
// Writer thread: drains sharedAudioBuffer to stdout (see rawOutputProc)
thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer),
ref(mutAudio), ref(cvAudio), ref(audioReady),
ref(audioFinished));
// Passed to textToAudio below; appends the current contents of
// audioBuffer to the shared buffer and wakes the writer thread
auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
&cvAudio, &audioReady]() {
// Signal thread that audio is ready
{
unique_lock lockAudio(mutAudio);
copy(audioBuffer.begin(), audioBuffer.end(),
back_inserter(sharedAudioBuffer));
audioReady = true;
cvAudio.notify_one();
}
};
piper::textToAudio(piperConfig, voice, line, audioBuffer, result,
audioCallback);
// Signal thread that there is no more audio
{
unique_lock lockAudio(mutAudio);
audioReady = true;
audioFinished = true;
cvAudio.notify_one();
}
// Wait for audio output to finish
spdlog::info("Waiting for audio to finish playing...");
rawOutputThread.join();
}
spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)",
result.realTimeFactor, result.inferSeconds,
result.audioSeconds);
// Restore config (--json-input)
voice.synthesisConfig.speakerId = speakerId;
} // for each line
piper::terminate(piperConfig);
return EXIT_SUCCESS;
}
// ----------------------------------------------------------------------------
// Writer thread body for raw output mode.
//
// Repeatedly waits until audioReady is set, moves whatever is in
// sharedAudioBuffer into a private buffer while holding mutAudio, then writes
// those samples to stdout without holding the lock. Returns once
// audioFinished is set and the shared buffer has been drained.
void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
                   condition_variable &cvAudio, bool &audioReady,
                   bool &audioFinished) {
  vector<int16_t> pendingSamples;

  for (;;) {
    {
      unique_lock guard{mutAudio};
      cvAudio.wait(guard, [&audioReady] { return audioReady; });

      // Nothing left to write and no more audio is coming
      if (audioFinished && sharedAudioBuffer.empty()) {
        break;
      }

      pendingSamples.insert(pendingSamples.end(), sharedAudioBuffer.begin(),
                            sharedAudioBuffer.end());
      sharedAudioBuffer.clear();

      // Once finished, leave the flag set so the next wait returns at once
      if (!audioFinished) {
        audioReady = false;
      }
    }

    // Write outside the lock so synthesis is not blocked by stdout
    cout.write(reinterpret_cast<const char *>(pendingSamples.data()),
               sizeof(int16_t) * pendingSamples.size());
    cout.flush();
    pendingSamples.clear();
  }
} // rawOutputProc
// ----------------------------------------------------------------------------
// Print the command-line usage summary to stderr.
//
// argv[0] is shown as the program name. Every option handled by parseArgs is
// listed, including --phoneme_silence and --version.
void printUsage(char *argv[]) {
  std::cerr << std::endl;
  std::cerr << "usage: " << argv[0] << " [options]" << std::endl;
  std::cerr << std::endl;
  std::cerr << "options:" << std::endl;
  std::cerr << " -h --help show this message and exit" << std::endl;
  std::cerr << " -m FILE --model FILE path to onnx model file" << std::endl;
  std::cerr << " -c FILE --config FILE path to model config file "
               "(default: model path + .json)"
            << std::endl;
  std::cerr << " -f FILE --output_file FILE path to output WAV file ('-' for "
               "stdout)"
            << std::endl;
  std::cerr << " -d DIR --output_dir DIR path to output directory (default: "
               "cwd)"
            << std::endl;
  std::cerr << " --output_raw output raw audio to stdout as it "
               "becomes available"
            << std::endl;
  std::cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << std::endl;
  std::cerr << " --noise_scale NUM generator noise (default: 0.667)"
            << std::endl;
  std::cerr << " --length_scale NUM phoneme length (default: 1.0)"
            << std::endl;
  std::cerr << " --noise_w NUM phoneme width noise (default: 0.8)"
            << std::endl;
  std::cerr << " --sentence_silence NUM seconds of silence after each "
               "sentence (default: 0.2)"
            << std::endl;
  // Takes two values: the phoneme (one codepoint) and the seconds of silence
  std::cerr << " --phoneme_silence PHONEME NUM seconds of extra silence "
               "after a phoneme"
            << std::endl;
  std::cerr << " --espeak_data DIR path to espeak-ng data directory"
            << std::endl;
  std::cerr << " --tashkeel_model FILE path to libtashkeel onnx model "
               "(arabic)"
            << std::endl;
  std::cerr << " --json-input stdin input is lines of JSON "
               "instead of plain text"
            << std::endl;
  std::cerr << " --use-cuda use CUDA execution provider" << std::endl;
  std::cerr << " --version print version and exit" << std::endl;
  std::cerr << " --debug print DEBUG messages to the console"
            << std::endl;
  std::cerr << " -q --quiet disable logging" << std::endl;
  std::cerr << std::endl;
}
// Require that the option at argv[argi] is followed by a value.
//
// When no argument follows, the usage text is printed and the process exits
// with a non-zero status, matching the exit(1) parseArgs uses for its other
// argument error (a missing value is a usage error, not a success).
void ensureArg(int argc, char *argv[], int argi) {
  if ((argi + 1) >= argc) {
    printUsage(argv);
    exit(1);
  }
}
// Parse command-line arguments into runConfig.
//
// Most options accept both an underscore and a dash spelling. Arguments that
// match no known option are skipped. After parsing, the model file and its
// config file (default: model path + ".json") must both be openable,
// otherwise std::runtime_error is thrown. --help and --version print and exit.
void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
  optional<filesystem::path> modelConfigPath;

  for (int i = 1; i < argc; i++) {
    std::string arg = argv[i];

    if (arg == "-m" || arg == "--model") {
      ensureArg(argc, argv, i);
      runConfig.modelPath = filesystem::path(argv[++i]);
    } else if (arg == "-c" || arg == "--config") {
      ensureArg(argc, argv, i);
      modelConfigPath = filesystem::path(argv[++i]);
    } else if (arg == "-f" || arg == "--output_file" ||
               arg == "--output-file") {
      ensureArg(argc, argv, i);
      std::string filePath = argv[++i];
      if (filePath == "-") {
        // "-" means WAV data goes to stdout, so there is no path
        runConfig.outputType = OUTPUT_STDOUT;
        runConfig.outputPath = nullopt;
      } else {
        runConfig.outputType = OUTPUT_FILE;
        runConfig.outputPath = filesystem::path(filePath);
      }
    } else if (arg == "-d" || arg == "--output_dir" ||
               arg == "--output-dir") {
      // The dash alias previously lacked its leading "--" and never matched
      ensureArg(argc, argv, i);
      runConfig.outputType = OUTPUT_DIRECTORY;
      runConfig.outputPath = filesystem::path(argv[++i]);
    } else if (arg == "--output_raw" || arg == "--output-raw") {
      runConfig.outputType = OUTPUT_RAW;
    } else if (arg == "-s" || arg == "--speaker") {
      ensureArg(argc, argv, i);
      runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]);
    } else if (arg == "--noise_scale" || arg == "--noise-scale") {
      ensureArg(argc, argv, i);
      runConfig.noiseScale = stof(argv[++i]);
    } else if (arg == "--length_scale" || arg == "--length-scale") {
      ensureArg(argc, argv, i);
      runConfig.lengthScale = stof(argv[++i]);
    } else if (arg == "--noise_w" || arg == "--noise-w") {
      ensureArg(argc, argv, i);
      runConfig.noiseW = stof(argv[++i]);
    } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
      ensureArg(argc, argv, i);
      runConfig.sentenceSilenceSeconds = stof(argv[++i]);
    } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") {
      // Takes two values: the phoneme and the seconds of silence
      ensureArg(argc, argv, i);
      ensureArg(argc, argv, i + 1);
      auto phonemeStr = std::string(argv[++i]);
      if (!piper::isSingleCodepoint(phonemeStr)) {
        std::cerr << "Phoneme '" << phonemeStr
                  << "' is not a single codepoint (--phoneme_silence)"
                  << std::endl;
        exit(1);
      }

      if (!runConfig.phonemeSilenceSeconds) {
        runConfig.phonemeSilenceSeconds.emplace();
      }

      auto phoneme = piper::getCodepoint(phonemeStr);
      (*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]);
    } else if (arg == "--espeak_data" || arg == "--espeak-data") {
      ensureArg(argc, argv, i);
      runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
    } else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") {
      ensureArg(argc, argv, i);
      runConfig.tashkeelModelPath = filesystem::path(argv[++i]);
    } else if (arg == "--json_input" || arg == "--json-input") {
      runConfig.jsonInput = true;
    } else if (arg == "--use_cuda" || arg == "--use-cuda") {
      runConfig.useCuda = true;
    } else if (arg == "--version") {
      std::cout << piper::getVersion() << std::endl;
      exit(0);
    } else if (arg == "--debug") {
      // Set DEBUG logging
      spdlog::set_level(spdlog::level::debug);
    } else if (arg == "-q" || arg == "--quiet") {
      // disable logging
      spdlog::set_level(spdlog::level::off);
    } else if (arg == "-h" || arg == "--help") {
      printUsage(argv);
      exit(0);
    }
  }

  // Verify model file exists
  ifstream modelFile(runConfig.modelPath.c_str(), ios::binary);
  if (!modelFile.good()) {
    throw runtime_error("Model file doesn't exist");
  }

  if (!modelConfigPath) {
    // Default: model path with ".json" appended
    runConfig.modelConfigPath =
        filesystem::path(runConfig.modelPath.string() + ".json");
  } else {
    runConfig.modelConfigPath = modelConfigPath.value();
  }

  // Verify model config exists
  ifstream modelConfigFile(runConfig.modelConfigPath.c_str());
  if (!modelConfigFile.good()) {
    throw runtime_error("Model config doesn't exist");
  }
}

View File

@@ -0,0 +1,636 @@
#include <array>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <limits>
#include <sstream>
#include <stdexcept>

#include <espeak-ng/speak_lib.h>
#include <onnxruntime_cxx_api.h>
#include <spdlog/spdlog.h>

#include "json.hpp"
#include "piper.hpp"
#include "utf8.h"
#include "wavfile.hpp"
namespace piper {
// VERSION is the stringified _PIPER_VERSION macro when the build defines it,
// and an empty string otherwise.
#ifdef _PIPER_VERSION
// https://stackoverflow.com/questions/47346133/how-to-use-a-define-inside-a-format-string
// Two-level expansion so the macro's value, not its name, is stringified
#define _STR(x) #x
#define STR(x) _STR(x)
const std::string VERSION = STR(_PIPER_VERSION);
#else
const std::string VERSION = "";
#endif
// Maximum value for 16-bit signed WAV sample
const float MAX_WAV_VALUE = 32767.0f;
// Name passed to Ort::Env when the ONNX environment is created in loadModel
const std::string instanceName{"piper"};
// Return the version string fixed at build time (empty when the build did
// not define one).
std::string getVersion() {
  return VERSION;
}
// Report whether s decodes to exactly one UTF-8 codepoint.
bool isSingleCodepoint(std::string s) {
  const auto codepointCount = utf8::distance(s.begin(), s.end());
  return codepointCount == 1;
}
// Decode and return the first UTF-8 codepoint of s.
Phoneme getCodepoint(std::string s) {
  auto first = s.begin();
  auto last = s.end();
  // Decoding iterator positioned at the start of the range [first, last)
  utf8::iterator decoder(first, first, last);
  return *decoder;
}
// Read the phonemization settings of a voice from its JSON config.
//
// Recognized keys (all optional as far as this function is concerned):
// {
//   "espeak": {
//     "voice": "<language code>"
//   },
//   "phoneme_type": "<espeak or text>",
//   "phoneme_map": {
//     "<from phoneme>": ["<to phoneme 1>", "<to phoneme 2>", ...]
//   },
//   "phoneme_id_map": {
//     "<phoneme>": [<id1>, <id2>, ...]
//   }
// }
//
// Throws std::runtime_error when a phoneme key or mapped phoneme is not
// exactly one codepoint.
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
  // eSpeak voice
  if (configRoot.contains("espeak")) {
    auto espeakSection = configRoot["espeak"];
    if (espeakSection.contains("voice")) {
      phonemizeConfig.eSpeak.voice = espeakSection["voice"].get<std::string>();
    }
  }

  // Phoneme type: only "text" changes the setting here
  if (configRoot.contains("phoneme_type")) {
    const auto typeName = configRoot["phoneme_type"].get<std::string>();
    if (typeName == "text") {
      phonemizeConfig.phonemeType = TextPhonemes;
    }
  }

  // phoneme -> [id]
  // Each phoneme maps to one or more phoneme ids.
  if (configRoot.contains("phoneme_id_map")) {
    auto idMapSection = configRoot["phoneme_id_map"];
    for (auto &entry : idMapSection.items()) {
      std::string phonemeText = entry.key();

      if (isSingleCodepoint(phonemeText)) {
        auto codepoint = getCodepoint(phonemeText);
        for (auto &idValue : entry.value()) {
          PhonemeId id = idValue.get<PhonemeId>();
          phonemizeConfig.phonemeIdMap[codepoint].push_back(id);
        }
        continue;
      }

      // Invalid key: include its ids in the log to help locate the entry
      std::stringstream idList;
      for (auto &idValue : entry.value()) {
        PhonemeId id = idValue.get<PhonemeId>();
        idList << id << ",";
      }

      spdlog::error("\"{}\" is not a single codepoint (ids={})", phonemeText,
                    idList.str());
      throw std::runtime_error(
          "Phonemes must be one codepoint (phoneme id map)");
    }
  }

  // phoneme -> [phoneme]
  // Each phoneme maps to one or more other phonemes (not normally used).
  if (configRoot.contains("phoneme_map")) {
    if (!phonemizeConfig.phonemeMap) {
      phonemizeConfig.phonemeMap.emplace();
    }

    auto phonemeMapSection = configRoot["phoneme_map"];
    for (auto &entry : phonemeMapSection.items()) {
      std::string sourceText = entry.key();
      if (!isSingleCodepoint(sourceText)) {
        spdlog::error("\"{}\" is not a single codepoint", sourceText);
        throw std::runtime_error(
            "Phonemes must be one codepoint (phoneme map)");
      }

      auto sourceCodepoint = getCodepoint(sourceText);
      for (auto &targetValue : entry.value()) {
        std::string targetText = targetValue.get<std::string>();
        if (!isSingleCodepoint(targetText)) {
          throw std::runtime_error(
              "Phonemes must be one codepoint (phoneme map)");
        }

        auto targetCodepoint = getCodepoint(targetText);
        (*phonemizeConfig.phonemeMap)[sourceCodepoint].push_back(
            targetCodepoint);
      }
    }
  }
} /* parsePhonemizeConfig */
// Read the audio synthesis settings of a voice from its JSON config.
//
// Recognized keys (each one only overrides a setting when present):
// {
//   "audio": {
//     "sample_rate": 22050
//   },
//   "inference": {
//     "noise_scale": 0.667,
//     "length_scale": 1,
//     "noise_w": 0.8,
//     "phoneme_silence": {
//       "<phoneme>": <seconds of silence>,
//       ...
//     }
//   }
// }
//
// Throws std::runtime_error when a "phoneme_silence" key is not exactly one
// codepoint.
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
  if (configRoot.contains("audio")) {
    auto audioSection = configRoot["audio"];
    if (audioSection.contains("sample_rate")) {
      // Default sample rate is 22050 Hz
      synthesisConfig.sampleRate = audioSection.value("sample_rate", 22050);
    }
  }

  // Everything below lives under "inference"
  if (!configRoot.contains("inference")) {
    return;
  }

  // Overrides default inference settings
  auto inferenceSection = configRoot["inference"];

  if (inferenceSection.contains("noise_scale")) {
    synthesisConfig.noiseScale = inferenceSection.value("noise_scale", 0.667f);
  }

  if (inferenceSection.contains("length_scale")) {
    synthesisConfig.lengthScale = inferenceSection.value("length_scale", 1.0f);
  }

  if (inferenceSection.contains("noise_w")) {
    synthesisConfig.noiseW = inferenceSection.value("noise_w", 0.8f);
  }

  if (!inferenceSection.contains("phoneme_silence")) {
    return;
  }

  // phoneme -> seconds of silence to add after
  synthesisConfig.phonemeSilenceSeconds.emplace();
  auto silenceSection = inferenceSection["phoneme_silence"];
  for (auto &entry : silenceSection.items()) {
    std::string phonemeText = entry.key();
    if (!isSingleCodepoint(phonemeText)) {
      spdlog::error("\"{}\" is not a single codepoint", phonemeText);
      throw std::runtime_error(
          "Phonemes must be one codepoint (phoneme silence)");
    }

    auto phoneme = getCodepoint(phonemeText);
    (*synthesisConfig.phonemeSilenceSeconds)[phoneme] =
        entry.value().get<float>();
  }
} /* parseSynthesisConfig */
// Read the speaker count and the optional speaker-name -> id map of a voice
// from its JSON config.
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
  // NOTE(review): "num_speakers" is read unconditionally; a config without
  // it presumably makes get<>() throw -- confirm every voice config has it.
  modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();

  if (!configRoot.contains("speaker_id_map")) {
    return;
  }

  if (!modelConfig.speakerIdMap) {
    modelConfig.speakerIdMap.emplace();
  }

  auto speakerSection = configRoot["speaker_id_map"];
  for (auto &entry : speakerSection.items()) {
    std::string name = entry.key();
    (*modelConfig.speakerIdMap)[name] = entry.value().get<SpeakerId>();
  }
} /* parseModelConfig */
// Set up the text front ends enabled in config: espeak-ng when useESpeak is
// set, and the libtashkeel model when useTashkeel is set.
//
// Throws std::runtime_error when espeak-ng fails to initialize or when
// libtashkeel is enabled without a model path.
void initialize(PiperConfig &config) {
  if (config.useESpeak) {
    // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
    // See: https://github.com/rhasspy/espeak-ng
    spdlog::debug("Initializing eSpeak");
    const int espeakStatus =
        espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
                          /*buflength*/ 0,
                          /*path*/ config.eSpeakDataPath.c_str(),
                          /*options*/ 0);
    if (espeakStatus < 0) {
      throw std::runtime_error("Failed to initialize eSpeak-ng");
    }

    spdlog::debug("Initialized eSpeak");
  }

  // Load onnx model for libtashkeel
  // https://github.com/mush42/libtashkeel/
  if (config.useTashkeel) {
    spdlog::debug("Using libtashkeel for diacritization");
    if (!config.tashkeelModelPath) {
      throw std::runtime_error("No path to libtashkeel model");
    }

    auto &tashkeelPath = config.tashkeelModelPath.value();
    spdlog::debug("Loading libtashkeel model from {}", tashkeelPath);
    config.tashkeelState = std::make_unique<tashkeel::State>();
    tashkeel::tashkeel_load(tashkeelPath, *config.tashkeelState);
    spdlog::debug("Initialized libtashkeel");
  }

  spdlog::info("Initialized piper");
}
// Shut down what initialize() set up; only espeak-ng needs explicit cleanup
// here.
void terminate(PiperConfig &config) {
  const bool espeakInUse = config.useESpeak;
  if (espeakInUse) {
    // Clean up espeak-ng
    spdlog::debug("Terminating eSpeak");
    espeak_Terminate();
    spdlog::debug("Terminated eSpeak");
  }

  spdlog::info("Terminated piper");
}
// Create the ONNX Runtime environment and session for the model at modelPath.
//
// modelPath: path to the .onnx file.
// session:   receives the environment, session options and loaded session.
// useCuda:   when true, the CUDA execution provider is appended.
//
// The time taken to create the session is logged at debug level.
void loadModel(std::string modelPath, ModelSession &session, bool useCuda) {
  spdlog::debug("Loading onnx model from {}", modelPath);
  session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
                         instanceName.c_str());
  session.env.DisableTelemetryEvents();

  if (useCuda) {
    // Use CUDA provider
    OrtCUDAProviderOptions cuda_options{};
    cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic;
    session.options.AppendExecutionProvider_CUDA(cuda_options);
  }

  // Slows down performance by ~2x
  // session.options.SetIntraOpNumThreads(1);

  // Roughly doubles load time for no visible inference benefit
  // session.options.SetGraphOptimizationLevel(
  //     GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
  session.options.SetGraphOptimizationLevel(
      GraphOptimizationLevel::ORT_DISABLE_ALL);

  // Slows down performance very slightly
  // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
  session.options.DisableCpuMemArena();
  session.options.DisableMemPattern();
  session.options.DisableProfiling();

  auto startTime = std::chrono::steady_clock::now();

#ifdef _WIN32
  // The session takes a wide-character path on Windows. Convert through
  // std::filesystem::path so the narrow string is decoded properly; widening
  // it byte-by-byte corrupts any path containing non-ASCII characters.
  auto modelPathW = std::filesystem::path(modelPath).wstring();
  auto modelPathStr = modelPathW.c_str();
#else
  auto modelPathStr = modelPath.c_str();
#endif

  session.onnx = Ort::Session(session.env, modelPathStr, session.options);

  auto endTime = std::chrono::steady_clock::now();
  spdlog::debug("Loaded onnx model in {} second(s)",
                std::chrono::duration<double>(endTime - startTime).count());
}
// Load a voice: parse its JSON config file, pick the speaker, then load the
// ONNX model.
//
// For voices with more than one speaker the given speakerId is used, or
// speaker 0 when none was given. For other voices speakerId is ignored.
void loadVoice(PiperConfig &config, std::string modelPath,
               std::string modelConfigPath, Voice &voice,
               std::optional<SpeakerId> &speakerId, bool useCuda) {
  spdlog::debug("Parsing voice config at {}", modelConfigPath);
  std::ifstream configStream(modelConfigPath);
  voice.configRoot = json::parse(configStream);

  parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
  parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
  parseModelConfig(voice.configRoot, voice.modelConfig);

  const bool multiSpeaker = voice.modelConfig.numSpeakers > 1;
  if (multiSpeaker) {
    // Requested speaker, falling back to the default speaker (0)
    voice.synthesisConfig.speakerId = speakerId.value_or(0);
  }

  spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);

  loadModel(modelPath, voice.session, useCuda);
} /* loadVoice */
// Phoneme ids to WAV audio
//
// Runs the onnx session on phonemeIds and appends the generated audio to
// audioBuffer as 16-bit samples, scaled so that the loudest sample fills the
// available range. result receives the inference time, the audio duration
// (sample count / synthesisConfig.sampleRate) and their ratio.
//
// Throws std::runtime_error when the session does not return exactly one
// tensor or when that tensor reports no dimensions.
void synthesize(std::vector<PhonemeId> &phonemeIds,
                SynthesisConfig &synthesisConfig, ModelSession &session,
                std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
  spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());

  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

  // Allocate
  // NOTE: the vectors below back the input tensors and must stay alive until
  // the session has run.
  std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
  std::vector<float> scales{synthesisConfig.noiseScale,
                            synthesisConfig.lengthScale,
                            synthesisConfig.noiseW};

  std::vector<Ort::Value> inputTensors;

  std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
      memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
      phonemeIdsShape.size()));

  std::vector<int64_t> phonemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
      memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
      phonemeIdLengthsShape.data(), phonemeIdLengthsShape.size()));

  std::vector<int64_t> scalesShape{(int64_t)scales.size()};
  inputTensors.push_back(
      Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
                                      scalesShape.data(), scalesShape.size()));

  // Add speaker id.
  // NOTE: These must be kept outside the "if" below to avoid being deallocated.
  std::vector<int64_t> speakerId{
      (int64_t)synthesisConfig.speakerId.value_or(0)};
  std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};

  if (synthesisConfig.speakerId) {
    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
        memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
        speakerIdShape.size()));
  }

  // From export_onnx.py
  // Only the first inputTensors.size() names are passed to Run(), so "sid" is
  // used only when a speaker id tensor was added above.
  std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
                                            "sid"};
  std::array<const char *, 1> outputNames = {"output"};

  // Infer
  auto startTime = std::chrono::steady_clock::now();
  auto outputTensors = session.onnx.Run(
      Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
      inputTensors.size(), outputNames.data(), outputNames.size());
  auto endTime = std::chrono::steady_clock::now();

  if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
    throw std::runtime_error("Invalid output tensors");
  }

  auto inferDuration = std::chrono::duration<double>(endTime - startTime);
  result.inferSeconds = inferDuration.count();

  const float *audio = outputTensors.front().GetTensorData<float>();
  auto audioShape =
      outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
  if (audioShape.empty()) {
    // Guard the last-dimension lookup below against an empty shape.
    throw std::runtime_error("Output tensor has no dimensions");
  }

  // The sample count is taken from the last dimension of the output tensor.
  int64_t audioCount = audioShape.back();
  result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
  result.realTimeFactor = 0.0;
  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }
  spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
                result.audioSeconds, result.inferSeconds);

  // Get max audio value for scaling.
  // The magnitude is computed as max(x, -x) rather than with an unqualified
  // abs() call, which can resolve to the integer overload and truncate
  // samples to zero.
  float maxAudioValue = 0.01f;
  for (int64_t i = 0; i < audioCount; i++) {
    maxAudioValue = std::max(maxAudioValue, std::max(audio[i], -audio[i]));
  }

  // We know the size up front. The buffer may already hold earlier audio,
  // so reserve room for the existing samples plus the new ones.
  if (audioCount > 0) {
    audioBuffer.reserve(audioBuffer.size() +
                        static_cast<std::size_t>(audioCount));
  }

  // Scale audio to fill range and convert to int16
  float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
  for (int64_t i = 0; i < audioCount; i++) {
    int16_t intAudioValue = static_cast<int16_t>(
        std::clamp(audio[i] * audioScale,
                   static_cast<float>(std::numeric_limits<int16_t>::min()),
                   static_cast<float>(std::numeric_limits<int16_t>::max())));

    audioBuffer.push_back(intAudioValue);
  }

  // Clean up
  for (std::size_t i = 0; i < outputTensors.size(); i++) {
    Ort::detail::OrtRelease(outputTensors[i].release());
  }

  for (std::size_t i = 0; i < inputTensors.size(); i++) {
    Ort::detail::OrtRelease(inputTensors[i].release());
  }
}
// ----------------------------------------------------------------------------
// Phonemize text and synthesize audio
//
// The text is optionally diacritized with libtashkeel, split into sentences
// of phonemes, and every sentence is synthesized independently into
// audioBuffer. Silence is appended after phrases (when per-phoneme silence is
// configured) and after each sentence. When audioCallback is set it is invoked
// after every sentence and audioBuffer is cleared afterwards. result
// accumulates inference and audio seconds across all phrases.
//
// Throws std::runtime_error when tashkeel is requested but not loaded.
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                 std::vector<int16_t> &audioBuffer, SynthesisResult &result,
                 const std::function<void()> &audioCallback) {
  SynthesisConfig &synthesisConfig = voice.synthesisConfig;

  // Number of zero samples appended after every sentence
  std::size_t sentenceSilenceSamples = 0;
  if (synthesisConfig.sentenceSilenceSeconds > 0) {
    sentenceSilenceSamples =
        (std::size_t)(synthesisConfig.sentenceSilenceSeconds *
                      synthesisConfig.sampleRate * synthesisConfig.channels);
  }

  if (config.useTashkeel) {
    if (!config.tashkeelState) {
      throw std::runtime_error("Tashkeel model is not loaded");
    }

    spdlog::debug("Diacritizing text with libtashkeel: {}", text);
    text = tashkeel::tashkeel_run(text, *config.tashkeelState);
  }

  // Phonemes for each sentence
  spdlog::debug("Phonemizing text: {}", text);
  std::vector<std::vector<Phoneme>> phonemes;

  if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
    // Use espeak-ng for phonemization
    eSpeakPhonemeConfig espeakOptions;
    espeakOptions.voice = voice.phonemizeConfig.eSpeak.voice;
    phonemize_eSpeak(text, espeakOptions, phonemes);
  } else {
    // Use UTF-8 codepoints as "phonemes"
    CodepointsPhonemeConfig codepointOptions;
    phonemize_codepoints(text, codepointOptions, phonemes);
  }

  // Synthesize each sentence independently.
  std::vector<PhonemeId> phonemeIds;
  std::map<Phoneme, std::size_t> missingPhonemes;

  for (std::vector<Phoneme> &sentencePhonemes : phonemes) {
    if (spdlog::should_log(spdlog::level::debug)) {
      // DEBUG log for phonemes
      std::string phonemesStr;
      for (auto phoneme : sentencePhonemes) {
        utf8::append(phoneme, std::back_inserter(phonemesStr));
      }

      spdlog::debug("Converting {} phoneme(s) to ids: {}",
                    sentencePhonemes.size(), phonemesStr);
    }

    std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
    std::vector<size_t> phraseSilenceSamples;

    // Use phoneme/id map from config
    PhonemeIdConfig idConfig;
    idConfig.phonemeIdMap =
        std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);

    if (synthesisConfig.phonemeSilenceSeconds) {
      // Split into phrases: a phrase ends right after any phoneme that has a
      // configured silence duration.
      std::map<Phoneme, float> &silenceByPhoneme =
          *synthesisConfig.phonemeSilenceSeconds;

      auto openPhrase = std::make_shared<std::vector<Phoneme>>();
      phrasePhonemes.push_back(openPhrase);

      for (Phoneme &currentPhoneme : sentencePhonemes) {
        openPhrase->push_back(currentPhoneme);

        auto silenceEntry = silenceByPhoneme.find(currentPhoneme);
        if (silenceEntry == silenceByPhoneme.end()) {
          continue;
        }

        // Split at phrase boundary
        phraseSilenceSamples.push_back(
            (std::size_t)(silenceEntry->second * synthesisConfig.sampleRate *
                          synthesisConfig.channels));

        openPhrase = std::make_shared<std::vector<Phoneme>>();
        phrasePhonemes.push_back(openPhrase);
      }
    } else {
      // Use all phonemes
      phrasePhonemes.push_back(
          std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
    }

    // Ensure results/samples are the same size
    std::vector<SynthesisResult> phraseResults(phrasePhonemes.size());
    if (phraseSilenceSamples.size() < phrasePhonemes.size()) {
      phraseSilenceSamples.resize(phrasePhonemes.size(), 0);
    }

    // phonemes -> ids -> audio
    for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
      std::vector<Phoneme> &currentPhrase = *phrasePhonemes[phraseIdx];
      if (currentPhrase.empty()) {
        continue;
      }

      // phonemes -> ids
      phonemes_to_ids(currentPhrase, idConfig, phonemeIds, missingPhonemes);

      if (spdlog::should_log(spdlog::level::debug)) {
        // DEBUG log for phoneme ids
        std::stringstream phonemeIdsStr;
        for (auto phonemeId : phonemeIds) {
          phonemeIdsStr << phonemeId << ", ";
        }

        spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
                      currentPhrase.size(), phonemeIds.size(),
                      phonemeIdsStr.str());
      }

      // ids -> audio
      SynthesisResult &phraseResult = phraseResults[phraseIdx];
      synthesize(phonemeIds, synthesisConfig, voice.session, audioBuffer,
                 phraseResult);

      // Add end of phrase silence
      audioBuffer.insert(audioBuffer.end(), phraseSilenceSamples[phraseIdx],
                         static_cast<int16_t>(0));

      result.audioSeconds += phraseResult.audioSeconds;
      result.inferSeconds += phraseResult.inferSeconds;

      phonemeIds.clear();
    }

    // Add end of sentence silence
    if (sentenceSilenceSamples > 0) {
      audioBuffer.insert(audioBuffer.end(), sentenceSilenceSamples,
                         static_cast<int16_t>(0));
    }

    if (audioCallback) {
      // Call back must copy audio since it is cleared afterwards.
      audioCallback();
      audioBuffer.clear();
    }

    phonemeIds.clear();
  }

  if (!missingPhonemes.empty()) {
    spdlog::warn("Missing {} phoneme(s) from phoneme/id map!",
                 missingPhonemes.size());

    for (const auto &missingEntry : missingPhonemes) {
      std::string phonemeStr;
      utf8::append(missingEntry.first, std::back_inserter(phonemeStr));

      spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr,
                   (uint32_t)missingEntry.first, missingEntry.second);
    }
  }

  if (result.audioSeconds > 0) {
    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
  }
} /* textToAudio */
// Phonemize text and synthesize audio to WAV file
//
// Synthesizes the whole text into memory (no audio callback is used), then
// writes a WAV header followed by the raw 16-bit samples to audioFile.
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
                   std::ostream &audioFile, SynthesisResult &result) {
  std::vector<int16_t> samples;
  textToAudio(config, voice, text, samples, result, nullptr);

  // Write WAV
  int sampleRate = voice.synthesisConfig.sampleRate;
  int sampleWidth = voice.synthesisConfig.sampleWidth;
  int channels = voice.synthesisConfig.channels;
  writeWavHeader(sampleRate, sampleWidth, channels, (int32_t)samples.size(),
                 audioFile);

  const char *rawSamples = reinterpret_cast<const char *>(samples.data());
  audioFile.write(rawSamples, sizeof(int16_t) * samples.size());
} /* textToWavFile */
} // namespace piper

View File

@@ -0,0 +1,132 @@
#ifndef PIPER_H_
#define PIPER_H_
#include <cstdint>
#include <fstream>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include <piper-phonemize/phoneme_ids.hpp>
#include <piper-phonemize/phonemize.hpp>
#include <piper-phonemize/tashkeel.hpp>
#include "json.hpp"
using json = nlohmann::json;
namespace piper {
typedef int64_t SpeakerId;
// Settings for phonemization with espeak-ng.
struct eSpeakConfig {
  // Name of the espeak-ng voice handed to the phonemizer
  std::string voice{"en-us"};
};
// Process-wide Piper settings and state shared by all voices.
struct PiperConfig {
  // Path to the espeak-ng data directory
  std::string eSpeakDataPath;

  bool useESpeak{true};

  // When true, text is diacritized with libtashkeel before phonemization;
  // tashkeelState must then be loaded.
  bool useTashkeel{false};

  std::optional<std::string> tashkeelModelPath;
  std::unique_ptr<tashkeel::State> tashkeelState;
};
// How text is turned into phonemes.
enum PhonemeType {
  eSpeakPhonemes, // phonemize with espeak-ng
  TextPhonemes    // use the text's UTF-8 codepoints as "phonemes"
};
// Phonemization settings of a voice.
struct PhonemizeConfig {
  PhonemeType phonemeType{eSpeakPhonemes};

  // Optional phoneme -> phonemes replacement table
  std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;

  // phoneme -> ids table used when converting phonemes to model input
  std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;

  PhonemeId idPad{0}; // padding (optionally interspersed)
  PhonemeId idBos{1}; // beginning of sentence
  PhonemeId idEos{2}; // end of sentence
  bool interspersePad{true};

  eSpeakConfig eSpeak;
};
// Inference and audio settings used when synthesizing with a voice.
struct SynthesisConfig {
  // VITS inference settings
  float noiseScale{0.667f};
  float lengthScale{1.0f};
  float noiseW{0.8f};

  // Audio settings
  int sampleRate{22050};
  int sampleWidth{2}; // 16-bit
  int channels{1};    // mono

  // Speaker id from 0 to numSpeakers - 1
  std::optional<SpeakerId> speakerId;

  // Extra silence: seconds appended after every sentence, and optionally
  // after specific phonemes.
  float sentenceSilenceSeconds{0.2f};
  std::optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
};
// Model-level settings of a voice.
struct ModelConfig {
  // Number of speakers in the model. Initialized to zero so that a freshly
  // constructed config never carries an indeterminate value before the
  // voice config has been parsed.
  int numSpeakers = 0;

  // speaker name -> id
  std::optional<std::map<std::string, SpeakerId>> speakerIdMap;
};
// onnxruntime objects that belong to one loaded model.
struct ModelSession {
  // The session starts out empty; it is assigned once a model is loaded.
  ModelSession() : onnx(nullptr) {}

  Ort::Session onnx;
  Ort::AllocatorWithDefaultOptions allocator;
  Ort::SessionOptions options;
  Ort::Env env;
};
// Timing information produced by synthesis.
//
// All fields start at zero: textToAudio() accumulates into the result it is
// given with +=, so a default-constructed result must not hold indeterminate
// values.
struct SynthesisResult {
  // Seconds spent running the model
  double inferSeconds = 0.0;

  // Seconds of audio that were generated
  double audioSeconds = 0.0;

  // inferSeconds / audioSeconds (left at zero when no audio was produced)
  double realTimeFactor = 0.0;
};
// Everything that belongs to one loaded voice; filled in by loadVoice().
struct Voice {
// Parsed JSON document of the voice's config file
json configRoot;
// Settings extracted from configRoot
PhonemizeConfig phonemizeConfig;
SynthesisConfig synthesisConfig;
ModelConfig modelConfig;
// onnxruntime session for the voice's model
ModelSession session;
};
// True if the string is a single UTF-8 codepoint
bool isSingleCodepoint(std::string s);
// Get the first UTF-8 codepoint of a string
Phoneme getCodepoint(std::string s);
// Get version of Piper
std::string getVersion();
// Must be called before using textTo* functions
void initialize(PiperConfig &config);
// Clean up
void terminate(PiperConfig &config);
// Load Onnx model and JSON config file.
// The config at modelConfigPath is parsed into voice; for models with more
// than one speaker, speakerId selects the speaker (speaker 0 when unset).
// useCuda requests the CUDA execution provider for the model session.
void loadVoice(PiperConfig &config, std::string modelPath,
std::string modelConfigPath, Voice &voice,
std::optional<SpeakerId> &speakerId, bool useCuda);
// Phonemize text and synthesize audio.
// Audio is appended to audioBuffer as 16-bit samples. When audioCallback is
// set it is called after each sentence and audioBuffer is cleared afterwards,
// so the callback must copy the audio it wants to keep.
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
const std::function<void()> &audioCallback);
// Phonemize text and synthesize audio to WAV file.
// Writes a WAV header followed by the 16-bit samples to audioFile.
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
std::ostream &audioFile, SynthesisResult &result);
} // namespace piper
#endif // PIPER_H_

View File

@@ -0,0 +1,60 @@
#include <fstream>
#include <functional>
#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>
#include "json.hpp"
#include "piper.hpp"
using namespace std;
using json = nlohmann::json;
// Smoke test: load a voice, synthesize a fixed sentence to a WAV file and
// verify that a plausible amount of data was written.
//
// Usage: <voice model path> <espeak-ng-data path> <output WAV path>
// The voice config is expected next to the model as "<model path>.json".
int main(int argc, char *argv[]) {
  piper::PiperConfig piperConfig;
  piper::Voice voice;

  if (argc < 2) {
    std::cerr << "Need voice model path" << std::endl;
    return 1;
  }

  if (argc < 3) {
    std::cerr << "Need espeak-ng-data path" << std::endl;
    return 1;
  }

  if (argc < 4) {
    std::cerr << "Need output WAV path" << std::endl;
    return 1;
  }

  auto modelPath = std::string(argv[1]);
  piperConfig.eSpeakDataPath = std::string(argv[2]);
  auto outputPath = std::string(argv[3]);

  optional<piper::SpeakerId> speakerId;
  piper::loadVoice(piperConfig, modelPath, modelPath + ".json", voice,
                   speakerId, false);
  piper::initialize(piperConfig);

  // Output audio to WAV file
  ofstream audioFile(outputPath, ios::binary);
  if (!audioFile.is_open()) {
    // Report the real cause instead of the misleading size check below.
    std::cerr << "ERROR: Could not open output file: " << outputPath
              << std::endl;
    piper::terminate(piperConfig);
    return EXIT_FAILURE;
  }

  piper::SynthesisResult result;
  piper::textToWavFile(piperConfig, voice, "This is a test.", audioFile,
                       result);
  piper::terminate(piperConfig);

  // Verify that file has some data
  if (audioFile.tellp() < 10000) {
    std::cerr << "ERROR: Output file is smaller than expected!" << std::endl;
    return EXIT_FAILURE;
  }

  std::cout << "OK" << std::endl;
  return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,34 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "utf8/checked.h"
#include "utf8/unchecked.h"
#endif // header guard

View File

@@ -0,0 +1,335 @@
// Copyright 2006-2016 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "core.h"
#include <stdexcept>
namespace utf8
{
// Common base class of every exception thrown by the library, so callers
// can catch all of them with a single handler.
class exception : public ::std::exception
{
};
// Exceptions that may be thrown from the library functions.

// Reports a value that is not a valid code point; carries that value.
class invalid_code_point : public exception
{
    uint32_t cp;

public:
    invalid_code_point(uint32_t codepoint) : cp(codepoint) {}

    virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE
    {
        return "Invalid code point";
    }

    // The value that was rejected
    uint32_t code_point() const { return cp; }
};
// Reports malformed UTF-8 input; carries the octet passed by the thrower.
class invalid_utf8 : public exception
{
    uint8_t u8;

public:
    invalid_utf8 (uint8_t u) : u8(u) {}

    virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE
    {
        return "Invalid UTF-8";
    }

    // The octet supplied when the exception was created
    uint8_t utf8_octet() const { return u8; }
};
// Reports malformed UTF-16 input; carries the offending 16-bit word.
class invalid_utf16 : public exception
{
    uint16_t u16;

public:
    invalid_utf16 (uint16_t u) : u16(u) {}

    virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE
    {
        return "Invalid UTF-16";
    }

    // The word supplied when the exception was created
    uint16_t utf16_word() const { return u16; }
};
// Reports that the input range ended before an operation could complete.
class not_enough_room : public exception
{
public:
    virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE
    {
        return "Not enough space";
    }
};
/// The library API - functions intended to be called by the users
// Encode the code point cp as UTF-8 and write its octets through result.
// Returns the iterator past the last octet written.
// Throws invalid_code_point when cp is not a valid code point.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (!utf8::internal::is_code_point_valid(cp))
        throw invalid_code_point(cp);

    if (cp < 0x80) {
        // one octet
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }

    if (cp < 0x800) {
        // two octets
        *(result++) = static_cast<uint8_t>((cp >> 6)   | 0xc0);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }

    if (cp < 0x10000) {
        // three octets
        *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
        *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
        *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
        return result;
    }

    // four octets
    *(result++) = static_cast<uint8_t>((cp >> 18)          | 0xf0);
    *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)  | 0x80);
    *(result++) = static_cast<uint8_t>((cp & 0x3f)         | 0x80);
    return result;
}
template <typename octet_iterator, typename output_iterator>
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
{
while (start != end) {
octet_iterator sequence_start = start;
internal::utf_error err_code = utf8::internal::validate_next(start, end);
switch (err_code) {
case internal::UTF8_OK :
for (octet_iterator it = sequence_start; it != start; ++it)
*out++ = *it;
break;
case internal::NOT_ENOUGH_ROOM:
out = utf8::append (replacement, out);
start = end;
break;
case internal::INVALID_LEAD:
out = utf8::append (replacement, out);
++start;
break;
case internal::INCOMPLETE_SEQUENCE:
case internal::OVERLONG_SEQUENCE:
case internal::INVALID_CODE_POINT:
out = utf8::append (replacement, out);
++start;
// just one replacement mark for the sequence
while (start != end && utf8::internal::is_trail(*start))
++start;
break;
}
}
return out;
}
// Same as the four-argument replace_invalid, using 0xfffd as the
// replacement code point.
template <typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
{
    static const uint32_t default_marker = utf8::internal::mask16(0xfffd);
    out = utf8::replace_invalid(start, end, out, default_marker);
    return out;
}
template <typename octet_iterator>
uint32_t next(octet_iterator& it, octet_iterator end)
{
uint32_t cp = 0;
internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
switch (err_code) {
case internal::UTF8_OK :
break;
case internal::NOT_ENOUGH_ROOM :
throw not_enough_room();
case internal::INVALID_LEAD :
case internal::INCOMPLETE_SEQUENCE :
case internal::OVERLONG_SEQUENCE :
throw invalid_utf8(*it);
case internal::INVALID_CODE_POINT :
throw invalid_code_point(cp);
}
return cp;
}
// Decode the code point starting at it without moving the caller's iterator
// (it is taken by value here).
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    const uint32_t decoded = utf8::next(it, end);
    return decoded;
}
// Move it back to the start of the preceding sequence and return the code
// point decoded there. start is the lower bound of the range.
// Throws not_enough_room when it == start, and invalid_utf8 when only trail
// octets are found down to start.
template <typename octet_iterator>
uint32_t prior(octet_iterator& it, octet_iterator start)
{
    // can't do much if it == start
    if (it == start)
        throw not_enough_room();

    const octet_iterator sequence_end = it;

    // Go back until we hit either a lead octet or start
    --it;
    while (utf8::internal::is_trail(*it)) {
        if (it == start)
            throw invalid_utf8(*it); // error - no lead byte in the sequence
        --it;
    }
    return utf8::peek_next(it, sequence_end);
}
// Move it by n code points. For n >= 0 the iterator moves forward and end is
// the end of the range; for n < 0 it moves backward and end is passed to
// prior() as the lower bound of the range.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    const distance_type zero(0);

    if (!(n < zero)) {
        // forward
        for (distance_type step = zero; step < n; ++step)
            utf8::next(it, end);
        return;
    }

    // backward
    for (distance_type step = n; step < zero; ++step)
        utf8::prior(it, end);
}
// Count the code points in [first, last) by decoding them one after another.
// Decoding errors are reported by the exceptions of next().
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type code_points = 0;
    while (first < last) {
        utf8::next(first, last);
        ++code_points;
    }
    return code_points;
}
// Convert the UTF-16 words in [start, end) to UTF-8 written through result.
// Throws invalid_utf16 for a lead surrogate without a following trail
// surrogate and for a lone trail surrogate.
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
    while (start != end) {
        uint32_t cp = utf8::internal::mask16(*start++);

        // Take care of surrogate pairs first
        if (utf8::internal::is_lead_surrogate(cp)) {
            // A lead surrogate at the very end of the input is an error
            if (start == end)
                throw invalid_utf16(static_cast<uint16_t>(cp));

            const uint32_t trail = utf8::internal::mask16(*start++);
            if (!utf8::internal::is_trail_surrogate(trail))
                throw invalid_utf16(static_cast<uint16_t>(trail));

            cp = (cp << 10) + trail + internal::SURROGATE_OFFSET;
        }
        // Lone trail surrogate
        else if (utf8::internal::is_trail_surrogate(cp)) {
            throw invalid_utf16(static_cast<uint16_t>(cp));
        }

        result = utf8::append(cp, result);
    }
    return result;
}
// Convert the UTF-8 octets in [start, end) to UTF-16 written through result.
// Code points above 0xffff are written as a surrogate pair.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    while (start < end) {
        const uint32_t cp = utf8::next(start, end);

        if (cp <= 0xffff) {
            *result++ = static_cast<uint16_t>(cp);
            continue;
        }

        // make a surrogate pair
        *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
        *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
    }
    return result;
}
// Encode every code point in [start, end) as UTF-8 through result.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = utf8::append(*start, result);

    return result;
}
// Decode the UTF-8 octets in [start, end) and write each code point through
// result.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t cp = utf8::next(start, end);
        (*result++) = cp;
    }
    return result;
}
// The iterator class
// Bidirectional iterator over the code points of a UTF-8 octet range.
// Dereferencing decodes the code point at the current position; stepping
// moves by whole sequences and stays inside [range_start, range_end].
template <typename octet_iterator>
class iterator {
    octet_iterator it;
    octet_iterator range_start;
    octet_iterator range_end;

public:
    typedef uint32_t value_type;
    typedef uint32_t* pointer;
    typedef uint32_t& reference;
    typedef std::ptrdiff_t difference_type;
    typedef std::bidirectional_iterator_tag iterator_category;

    iterator () {}

    // Throws std::out_of_range when octet_it lies outside the given range.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& rangestart,
                       const octet_iterator& rangeend) :
        it(octet_it), range_start(rangestart), range_end(rangeend)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // the default "big three" are OK

    // The underlying octet iterator at the current position
    octet_iterator base () const { return it; }

    uint32_t operator * () const
    {
        // Decode on a copy so that dereferencing does not move the iterator
        octet_iterator cursor = it;
        return utf8::next(cursor, range_end);
    }

    // Throws std::logic_error when the two iterators cover different ranges.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    bool operator != (const iterator& rhs) const
    {
        return !(*this == rhs);
    }

    iterator& operator ++ ()
    {
        utf8::next(it, range_end);
        return *this;
    }

    iterator operator ++ (int)
    {
        iterator before(*this);
        utf8::next(it, range_end);
        return before;
    }

    iterator& operator -- ()
    {
        utf8::prior(it, range_start);
        return *this;
    }

    iterator operator -- (int)
    {
        iterator before(*this);
        utf8::prior(it, range_start);
        return before;
    }
}; // class iterator
} // namespace utf8
#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
#include "cpp17.h"
#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
#include "cpp11.h"
#endif // C++ 11 or later
#endif //header guard

View File

@@ -0,0 +1,338 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include <iterator>
// Determine the C++ standard version.
// If the user defines UTF_CPP_CPLUSPLUS, use that.
// Otherwise, trust the unreliable predefined macro __cplusplus
#if !defined UTF_CPP_CPLUSPLUS
#define UTF_CPP_CPLUSPLUS __cplusplus
#endif
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
#define UTF_CPP_OVERRIDE override
#define UTF_CPP_NOEXCEPT noexcept
#else // C++ 98/03
#define UTF_CPP_OVERRIDE
#define UTF_CPP_NOEXCEPT throw()
#endif // C++ 11 or later
namespace utf8
{
// Unsigned integer typedefs used throughout the library for octets (8-bit),
// UTF-16 words (16-bit) and code points (32-bit).
// They are declared inside namespace utf8 and have the same names as the
// ones from cstdint, or boost/cstdint.
// You may need to change them to match your system.
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
// Unicode constants used by the surrogate checks and the UTF-16 conversions
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10)
const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
// Maximum valid value for a Unicode code point
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
// Keep only the low 8 bits of oc.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    const uint8_t low_octet = static_cast<uint8_t>(0xff & oc);
    return low_octet;
}
// Keep only the low 16 bits of oc.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    const uint16_t low_word = static_cast<uint16_t>(0xffff & oc);
    return low_word;
}
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
return ((utf8::internal::mask8(oc) >> 6) == 0x2);
}
// True when cp lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
template <typename u16>
inline bool is_lead_surrogate(u16 cp)
{
    if (cp < LEAD_SURROGATE_MIN)
        return false;
    return cp <= LEAD_SURROGATE_MAX;
}
// True when cp lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
template <typename u16>
inline bool is_trail_surrogate(u16 cp)
{
    if (cp < TRAIL_SURROGATE_MIN)
        return false;
    return cp <= TRAIL_SURROGATE_MAX;
}
// True when cp is any surrogate, lead or trail
// ([LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX]).
template <typename u16>
inline bool is_surrogate(u16 cp)
{
    if (cp < LEAD_SURROGATE_MIN)
        return false;
    return cp <= TRAIL_SURROGATE_MAX;
}
// True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
template <typename u32>
inline bool is_code_point_valid(u32 cp)
{
    if (cp > CODE_POINT_MAX)
        return false;
    return !utf8::internal::is_surrogate(cp);
}
// Length (1-4) of the UTF-8 sequence announced by the lead octet at lead_it,
// or 0 when that octet is not a valid lead octet.
template <typename octet_iterator>
inline typename std::iterator_traits<octet_iterator>::difference_type
sequence_length(octet_iterator lead_it)
{
    const uint8_t lead = utf8::internal::mask8(*lead_it);

    if (lead < 0x80)            // 0xxxxxxx
        return 1;
    if ((lead >> 5) == 0x6)     // 110xxxxx
        return 2;
    if ((lead >> 4) == 0xe)     // 1110xxxx
        return 3;
    if ((lead >> 3) == 0x1e)    // 11110xxx
        return 4;
    return 0;
}
// True when cp was encoded with a different number of octets than its range
// calls for: 1 below 0x80, 2 below 0x800, 3 below 0x10000. Code points from
// 0x10000 upwards are never reported as overlong.
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
    if (cp < 0x80)
        return length != 1;
    if (cp < 0x800)
        return length != 2;
    if (cp < 0x10000)
        return length != 3;
    return false;
}
// Result codes of the validation and decoding helpers.
enum utf_error {
    UTF8_OK,
    NOT_ENOUGH_ROOM,
    INVALID_LEAD,
    INCOMPLETE_SEQUENCE,
    OVERLONG_SEQUENCE,
    INVALID_CODE_POINT
};
/// Helper for get_sequence_x
/// Advances it by one octet and checks that a trail octet is found there.
/// Returns NOT_ENOUGH_ROOM when the end is reached and INCOMPLETE_SEQUENCE
/// when the next octet is not a trail octet.
template <typename octet_iterator>
utf_error increase_safely(octet_iterator& it, octet_iterator end)
{
    ++it;
    if (it == end)
        return NOT_ENOUGH_ROOM;

    return utf8::internal::is_trail(*it) ? UTF8_OK : INCOMPLETE_SEQUENCE;
}
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
/// get_sequence_x functions decode utf-8 sequences of the length x
/// On success `it` is left on the last octet of the sequence.

// One-octet sequence: the code point is the octet itself.
template <typename octet_iterator>
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
    if (it != end) {
        code_point = utf8::internal::mask8(*it);
        return UTF8_OK;
    }
    return NOT_ENOUGH_ROOM;
}
// Two-octet sequence: 5 payload bits from the lead octet, 6 from the trail.
template <typename octet_iterator>
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
    if (it == end)
        return NOT_ENOUGH_ROOM;

    code_point = utf8::internal::mask8(*it);

    const utf_error trail_status = increase_safely(it, end);
    if (trail_status != UTF8_OK)
        return trail_status;

    const uint32_t lead_bits = (code_point << 6) & 0x7ff;
    code_point = lead_bits + ((*it) & 0x3f);

    return UTF8_OK;
}
// Three-octet sequence: 4 payload bits from the lead octet, 6 from each of
// the two trail octets.
template <typename octet_iterator>
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
    if (it == end)
        return NOT_ENOUGH_ROOM;

    code_point = utf8::internal::mask8(*it);

    utf_error trail_status = increase_safely(it, end);
    if (trail_status != UTF8_OK)
        return trail_status;

    code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);

    trail_status = increase_safely(it, end);
    if (trail_status != UTF8_OK)
        return trail_status;

    code_point += (*it) & 0x3f;

    return UTF8_OK;
}
// Four-octet sequence: 3 payload bits from the lead octet, 6 from each of
// the three trail octets.
template <typename octet_iterator>
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
    if (it == end)
        return NOT_ENOUGH_ROOM;

    code_point = utf8::internal::mask8(*it);

    utf_error trail_status = increase_safely(it, end);
    if (trail_status != UTF8_OK)
        return trail_status;

    code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);

    trail_status = increase_safely(it, end);
    if (trail_status != UTF8_OK)
        return trail_status;

    code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;

    trail_status = increase_safely(it, end);
    if (trail_status != UTF8_OK)
        return trail_status;

    code_point += (*it) & 0x3f;

    return UTF8_OK;
}
#undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
/// Decodes and validates one UTF-8 sequence starting at `it`.
/// On success the code point is stored in `code_point`, `it` is moved past the
/// sequence and UTF8_OK is returned. On failure `it` keeps its starting value
/// and the error code is returned.
template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
    if (it == end)
        return NOT_ENOUGH_ROOM;

    // Remember where we started so the iterator can be rewound on failure.
    // (This is of little use with single-pass iterators such as stream iterators.)
    octet_iterator sequence_start = it;
    uint32_t decoded = 0;

    // The lead octet determines how many octets the sequence has.
    typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
    const octet_difference_type length = utf8::internal::sequence_length(it);

    // Read the trail octets and assemble the code point.
    utf_error status = UTF8_OK;
    switch (length) {
        case 0:
            return INVALID_LEAD;
        case 1:
            status = utf8::internal::get_sequence_1(it, end, decoded);
            break;
        case 2:
            status = utf8::internal::get_sequence_2(it, end, decoded);
            break;
        case 3:
            status = utf8::internal::get_sequence_3(it, end, decoded);
            break;
        case 4:
            status = utf8::internal::get_sequence_4(it, end, decoded);
            break;
    }

    if (status == UTF8_OK) {
        // Decoding worked; now apply the security checks.
        if (!utf8::internal::is_code_point_valid(decoded))
            status = INVALID_CODE_POINT;
        else if (utf8::internal::is_overlong_sequence(decoded, length))
            status = OVERLONG_SEQUENCE;
        else {
            // All checks passed: publish the result and step past the last octet.
            code_point = decoded;
            ++it;
            return UTF8_OK;
        }
    }

    // Failure: rewind the iterator to where it started.
    it = sequence_start;
    return status;
}
/// Overload for callers that only need the status: validates the next
/// sequence and throws away the decoded code point.
template <typename octet_iterator>
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
    uint32_t discarded;
    const utf_error status = utf8::internal::validate_next(it, end, discarded);
    return status;
}
} // namespace internal
/// The library API - functions intended to be called by the users
// Byte order mark
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
/// Returns an iterator to the first octet of the first invalid UTF-8 sequence
/// in [start, end), or `end` when every sequence validates.
template <typename octet_iterator>
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
{
    octet_iterator cursor = start;
    // validate_next advances `cursor` on success and leaves it in place on failure.
    while (cursor != end && utf8::internal::validate_next(cursor, end) == internal::UTF8_OK) {
    }
    return cursor;
}
/// True when [start, end) contains no invalid UTF-8 sequence.
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    const octet_iterator first_bad = utf8::find_invalid(start, end);
    return first_bad == end;
}
/// True when [it, end) begins with the three octets of `bom`.
template <typename octet_iterator>
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
{
    for (int i = 0; i < 3; ++i, ++it) {
        // Running out of input or hitting a different octet both mean "no BOM".
        if (it == end || utf8::internal::mask8(*it) != bom[i])
            return false;
    }
    return true;
}
} // namespace utf8
#endif // header guard

View File

@@ -0,0 +1,103 @@
// Copyright 2018 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
#include "checked.h"
#include <string>
namespace utf8
{
    /// Appends the UTF-8 encoding of code point `cp` to `s`.
    inline void append(char32_t cp, std::string& s)
    {
        append(static_cast<uint32_t>(cp), std::back_inserter(s));
    }

    /// Converts a UTF-16 string to a UTF-8 encoded std::string.
    inline std::string utf16to8(const std::u16string& s)
    {
        std::string encoded;
        utf16to8(s.begin(), s.end(), std::back_inserter(encoded));
        return encoded;
    }

    /// Converts a UTF-8 encoded std::string to a UTF-16 string.
    inline std::u16string utf8to16(const std::string& s)
    {
        std::u16string converted;
        utf8to16(s.begin(), s.end(), std::back_inserter(converted));
        return converted;
    }

    /// Converts a UTF-32 string to a UTF-8 encoded std::string.
    inline std::string utf32to8(const std::u32string& s)
    {
        std::string encoded;
        utf32to8(s.begin(), s.end(), std::back_inserter(encoded));
        return encoded;
    }

    /// Converts a UTF-8 encoded std::string to a UTF-32 string.
    inline std::u32string utf8to32(const std::string& s)
    {
        std::u32string converted;
        utf8to32(s.begin(), s.end(), std::back_inserter(converted));
        return converted;
    }

    /// Returns the offset of the first invalid octet in `s`, or npos if none.
    inline std::size_t find_invalid(const std::string& s)
    {
        const std::string::const_iterator first_bad = find_invalid(s.begin(), s.end());
        if (first_bad == s.end())
            return std::string::npos;
        return static_cast<std::size_t>(first_bad - s.begin());
    }

    /// True when the whole string validates as UTF-8.
    inline bool is_valid(const std::string& s)
    {
        return is_valid(s.begin(), s.end());
    }

    /// Returns a copy of `s` with invalid sequences replaced by `replacement`.
    inline std::string replace_invalid(const std::string& s, char32_t replacement)
    {
        std::string repaired;
        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired), replacement);
        return repaired;
    }

    /// Returns a copy of `s` with invalid sequences replaced by the default marker
    /// chosen by the iterator-based overload.
    inline std::string replace_invalid(const std::string& s)
    {
        std::string repaired;
        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired));
        return repaired;
    }

    /// True when `s` begins with the UTF-8 byte order mark.
    inline bool starts_with_bom(const std::string& s)
    {
        return starts_with_bom(s.begin(), s.end());
    }
} // namespace utf8
#endif // header guard

View File

@@ -0,0 +1,103 @@
// Copyright 2018 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
#include "checked.h"
#include <string>
namespace utf8
{
    /// Appends the UTF-8 encoding of code point `cp` to `s`.
    inline void append(char32_t cp, std::string& s)
    {
        append(static_cast<uint32_t>(cp), std::back_inserter(s));
    }

    /// Converts a UTF-16 view to a UTF-8 encoded std::string.
    inline std::string utf16to8(std::u16string_view s)
    {
        std::string encoded;
        utf16to8(s.begin(), s.end(), std::back_inserter(encoded));
        return encoded;
    }

    /// Converts a UTF-8 encoded view to a UTF-16 string.
    inline std::u16string utf8to16(std::string_view s)
    {
        std::u16string converted;
        utf8to16(s.begin(), s.end(), std::back_inserter(converted));
        return converted;
    }

    /// Converts a UTF-32 view to a UTF-8 encoded std::string.
    inline std::string utf32to8(std::u32string_view s)
    {
        std::string encoded;
        utf32to8(s.begin(), s.end(), std::back_inserter(encoded));
        return encoded;
    }

    /// Converts a UTF-8 encoded view to a UTF-32 string.
    inline std::u32string utf8to32(std::string_view s)
    {
        std::u32string converted;
        utf8to32(s.begin(), s.end(), std::back_inserter(converted));
        return converted;
    }

    /// Returns the offset of the first invalid octet in `s`, or npos if none.
    inline std::size_t find_invalid(std::string_view s)
    {
        const std::string_view::const_iterator first_bad = find_invalid(s.begin(), s.end());
        if (first_bad == s.end())
            return std::string_view::npos;
        return static_cast<std::size_t>(first_bad - s.begin());
    }

    /// True when the whole view validates as UTF-8.
    inline bool is_valid(std::string_view s)
    {
        return is_valid(s.begin(), s.end());
    }

    /// Returns a copy of `s` with invalid sequences replaced by `replacement`.
    inline std::string replace_invalid(std::string_view s, char32_t replacement)
    {
        std::string repaired;
        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired), replacement);
        return repaired;
    }

    /// Returns a copy of `s` with invalid sequences replaced by the default marker
    /// chosen by the iterator-based overload.
    inline std::string replace_invalid(std::string_view s)
    {
        std::string repaired;
        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired));
        return repaired;
    }

    /// True when `s` begins with the UTF-8 byte order mark.
    inline bool starts_with_bom(std::string_view s)
    {
        return starts_with_bom(s.begin(), s.end());
    }
} // namespace utf8
#endif // header guard

View File

@@ -0,0 +1,274 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "core.h"
namespace utf8
{
namespace unchecked
{
/// Writes the UTF-8 encoding of `cp` through `result` and returns the
/// iterator past the last octet written. No validity check is made on `cp`.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (cp < 0x80) {
        // one octet
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }
    if (cp < 0x800) {
        // two octets
        *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }
    if (cp < 0x10000) {
        // three octets
        *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
        *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }
    // four octets
    *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
    *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
    return result;
}
/// Copies [start, end) to `out`, substituting the UTF-8 encoding of
/// `replacement` for each invalid sequence. Returns the advanced `out`.
template <typename octet_iterator, typename output_iterator>
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
{
while (start != end) {
octet_iterator sequence_start = start;
// validate_next moves `start` past a valid sequence and leaves it untouched on error.
internal::utf_error err_code = utf8::internal::validate_next(start, end);
switch (err_code) {
case internal::UTF8_OK :
// Valid sequence: copy its octets through unchanged.
for (octet_iterator it = sequence_start; it != start; ++it)
*out++ = *it;
break;
case internal::NOT_ENOUGH_ROOM:
// Truncated tail: emit one marker and stop consuming input.
out = utf8::unchecked::append (replacement, out);
start = end;
break;
case internal::INVALID_LEAD:
// Bad lead octet: emit one marker and skip just that octet.
out = utf8::unchecked::append (replacement, out);
++start;
break;
case internal::INCOMPLETE_SEQUENCE:
case internal::OVERLONG_SEQUENCE:
case internal::INVALID_CODE_POINT:
out = utf8::unchecked::append (replacement, out);
++start;
// just one replacement mark for the sequence
while (start != end && utf8::internal::is_trail(*start))
++start;
break;
}
}
return out;
}
/// Same as the four-argument overload, using code point 0xfffd as the marker.
template <typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
{
    static const uint32_t default_marker = utf8::internal::mask16(0xfffd);
    return utf8::unchecked::replace_invalid(start, end, out, default_marker);
}
/// Decodes the code point at `it` and advances `it` past the sequence.
/// There is no end iterator and no validation: the caller must guarantee
/// that `it` points at a complete, well-formed sequence.
template <typename octet_iterator>
uint32_t next(octet_iterator& it)
{
uint32_t cp = utf8::internal::mask8(*it);
typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
// Any length other than 1-4 matches no case: `cp` stays the lead octet and `it` advances by one.
switch (length) {
case 1:
break;
case 2:
it++;
// Lead payload shifted up by 6 plus the low 6 bits of the trail octet.
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
break;
case 3:
++it;
cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
++it;
cp += (*it) & 0x3f;
break;
case 4:
++it;
cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
++it;
cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
++it;
cp += (*it) & 0x3f;
break;
}
// Step off the last octet of the sequence.
++it;
return cp;
}
/// Decodes the code point at `it` without moving the caller's iterator
/// (`it` is taken by value).
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it)
{
    const uint32_t cp = utf8::unchecked::next(it);
    return cp;
}
/// Moves `it` back to the lead octet of the previous sequence and returns
/// that sequence's code point. No lower bound is checked.
template <typename octet_iterator>
uint32_t prior(octet_iterator& it)
{
    // Step back at least once, then keep going while we are on a trail octet.
    do {
        --it;
    } while (utf8::internal::is_trail(*it));
    octet_iterator cursor = it;
    return utf8::unchecked::next(cursor);
}
/// Moves `it` by `n` code points: forward for positive `n`, backward for negative `n`.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n)
{
    const distance_type zero(0);
    if (!(n < zero)) {
        // forward
        for (distance_type step = zero; step < n; ++step)
            utf8::unchecked::next(it);
        return;
    }
    // backward
    for (distance_type step = n; step < zero; ++step)
        utf8::unchecked::prior(it);
}
/// Counts the code points decoded while walking from `first` up to `last`.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typename std::iterator_traits<octet_iterator>::difference_type count;
    count = 0;
    while (first < last) {
        utf8::unchecked::next(first);
        ++count;
    }
    return count;
}
/// Converts UTF-16 code units in [start, end) to UTF-8 written through `result`.
/// A lead surrogate is combined with the following unit without checking
/// that one exists.
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
    while (start != end) {
        uint32_t cp = utf8::internal::mask16(*start);
        ++start;
        // Take care of surrogate pairs first
        if (utf8::internal::is_lead_surrogate(cp)) {
            const uint32_t trail_surrogate = utf8::internal::mask16(*start);
            ++start;
            cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
        }
        result = utf8::unchecked::append(cp, result);
    }
    return result;
}
/// Converts UTF-8 octets in [start, end) to UTF-16 code units written through `result`.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    while (start < end) {
        const uint32_t cp = utf8::unchecked::next(start);
        if (!(cp > 0xffff)) {
            *result++ = static_cast<uint16_t>(cp);
            continue;
        }
        // Outside the 16-bit range: make a surrogate pair
        *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
        *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
    }
    return result;
}
/// Encodes each code point in [start, end) as UTF-8 through `result`.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = utf8::unchecked::append(*start, result);
    return result;
}
/// Decodes UTF-8 octets in [start, end) into code points written through `result`.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t cp = utf8::unchecked::next(start);
        *result++ = cp;
    }
    return result;
}
// The iterator class
// Bidirectional adaptor that walks an octet sequence one code point at a
// time, using the unchecked decoders (no bounds or validity checks).
template <typename octet_iterator>
class iterator {
    octet_iterator it;
public:
    typedef uint32_t value_type;
    typedef uint32_t* pointer;
    typedef uint32_t& reference;
    typedef std::ptrdiff_t difference_type;
    typedef std::bidirectional_iterator_tag iterator_category;

    iterator () {}
    explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
    // the default "big three" are OK

    // Underlying octet iterator at the current position.
    octet_iterator base () const { return it; }

    // Decodes the current code point without moving.
    uint32_t operator * () const
    {
        octet_iterator cursor = it;
        return utf8::unchecked::next(cursor);
    }

    bool operator == (const iterator& rhs) const
    {
        return (it == rhs.it);
    }
    bool operator != (const iterator& rhs) const
    {
        return !(it == rhs.it);
    }

    // Step forward by the length announced by the current lead octet.
    iterator& operator ++ ()
    {
        ::std::advance(it, utf8::internal::sequence_length(it));
        return *this;
    }
    iterator operator ++ (int)
    {
        iterator previous = *this;
        ++(*this);
        return previous;
    }

    // Step back to the lead octet of the preceding sequence.
    iterator& operator -- ()
    {
        utf8::unchecked::prior(it);
        return *this;
    }
    iterator operator -- (int)
    {
        iterator previous = *this;
        --(*this);
        return previous;
    }
}; // class iterator
} // namespace utf8::unchecked
} // namespace utf8
#endif // header guard

View File

@@ -0,0 +1,40 @@
#ifndef WAVFILE_H_
#define WAVFILE_H_
#include <iostream>
// In-memory image of a canonical 44-byte PCM WAV header. It is written to the
// stream as raw bytes, so multi-byte fields go out in host byte order.
// NOTE(review): WAV is a little-endian format; a big-endian host would need
// explicit byte swapping - confirm target platforms.
struct WavHeader {
  uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
  uint32_t chunkSize; // bytes following this field: dataSize + header size - 8
  uint8_t WAVE[4] = {'W', 'A', 'V', 'E'};

  // fmt
  uint8_t fmt[4] = {'f', 'm', 't', ' '};
  uint32_t fmtSize = 16;      // bytes
  uint16_t audioFormat = 1;   // PCM
  uint16_t numChannels;       // set by writeWavHeader from `channels`
  uint32_t sampleRate;        // Hertz
  uint32_t bytesPerSec;       // sampleRate * sampleWidth * channels
  uint16_t blockAlign = 2;    // default: 16-bit mono; overwritten by writeWavHeader
  uint16_t bitsPerSample = 16;

  // data
  uint8_t data[4] = {'d', 'a', 't', 'a'};
  uint32_t dataSize;          // numSamples * sampleWidth * channels
};

// The struct is dumped verbatim, so any padding would corrupt the file.
static_assert(sizeof(WavHeader) == 44, "WavHeader must be exactly 44 bytes");

// Write WAV file header only
//
// sampleRate  - samples per second (Hertz)
// sampleWidth - bytes per sample per channel
// channels    - number of channels
// numSamples  - number of sample frames that will follow the header
// audioFile   - destination stream; exactly sizeof(WavHeader) bytes are written
//
// Declared inline because this definition lives in a header: without it,
// including the header from two translation units is a multiple-definition
// link error.
inline void writeWavHeader(int sampleRate, int sampleWidth, int channels,
                           uint32_t numSamples, std::ostream &audioFile) {
  WavHeader header;
  header.dataSize = numSamples * sampleWidth * channels;
  header.chunkSize = header.dataSize + sizeof(WavHeader) - 8;
  header.sampleRate = sampleRate;
  header.numChannels = channels;
  header.bytesPerSec = sampleRate * sampleWidth * channels;
  header.blockAlign = sampleWidth * channels;
  audioFile.write(reinterpret_cast<const char *>(&header), sizeof(header));

} /* writeWavHeader */
#endif // WAVFILE_H_

View File

@@ -0,0 +1 @@
*

View File

@@ -0,0 +1,6 @@
FROM nvcr.io/nvidia/pytorch:22.03-py3
RUN pip3 install \
'pytorch-lightning~=1.7.0'
ENV NUMBA_CACHE_DIR=.numba_cache

View File

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Builds the Cython extension in piper_train/vits/monotonic_align in place and
# moves the resulting shared object(s) into its monotonic_align/ subdirectory.
# Abort on the first failing command, including failures inside a pipeline.
set -eo pipefail
# Absolute path of the directory containing this script.
this_dir="$( cd "$( dirname "$0" )" && pwd )"
# Use the local virtual environment when one exists next to the script.
if [ -d "${this_dir}/.venv" ]; then
source "${this_dir}/.venv/bin/activate"
fi
cd "${this_dir}/piper_train/vits/monotonic_align"
mkdir -p monotonic_align
# Compile core.pyx in place (-i), then move the built core*.so into the subdirectory.
cythonize -i core.pyx
mv core*.so monotonic_align/

View File

@@ -0,0 +1,11 @@
[mypy]
[mypy-setuptools.*]
ignore_missing_imports = True
[mypy-librosa.*]
ignore_missing_imports = True
[mypy-onnxruntime.*]
ignore_missing_imports = True

View File

@@ -0,0 +1,11 @@
.DS_Store
.idea
*.log
tmp/
*.py[cod]
*.egg
build
htmlcov
.venv/

View File

@@ -0,0 +1,6 @@
[settings]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88

View File

@@ -0,0 +1 @@
1.0.0

View File

@@ -0,0 +1,147 @@
import argparse
import json
import logging
from pathlib import Path
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from .vits.lightning import VitsModel
_LOGGER = logging.getLogger(__package__)
def main():
    """Train a VITS model on a pre-processed dataset directory.

    Reads ``config.json`` and ``dataset.jsonl`` from ``--dataset-dir``, builds
    a Lightning ``Trainer`` and a ``VitsModel`` from the command-line
    arguments, optionally seeds a multi-speaker model from a single-speaker
    checkpoint, and starts training.
    """
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset-dir", required=True, help="Path to pre-processed dataset directory"
    )
    parser.add_argument(
        "--checkpoint-epochs",
        type=int,
        help="Save checkpoint every N epochs (default: 1)",
    )
    parser.add_argument(
        "--quality",
        default="medium",
        choices=("x-low", "medium", "high"),
        help="Quality/size of model (default: medium)",
    )
    parser.add_argument(
        "--resume_from_single_speaker_checkpoint",
        help="For multi-speaker models only. Converts a single-speaker checkpoint to multi-speaker and resumes training",
    )
    Trainer.add_argparse_args(parser)
    VitsModel.add_model_specific_args(parser)
    parser.add_argument("--seed", type=int, default=1234)
    args = parser.parse_args()
    _LOGGER.debug(args)

    # Trainer output goes next to the dataset unless a root dir was given.
    args.dataset_dir = Path(args.dataset_dir)
    if not args.default_root_dir:
        args.default_root_dir = args.dataset_dir

    torch.backends.cudnn.benchmark = True
    torch.manual_seed(args.seed)

    config_path = args.dataset_dir / "config.json"
    dataset_path = args.dataset_dir / "dataset.jsonl"

    # See preprocess.py for format
    config = json.loads(config_path.read_text(encoding="utf-8"))
    num_symbols = int(config["num_symbols"])
    num_speakers = int(config["num_speakers"])
    sample_rate = int(config["audio"]["sample_rate"])

    trainer = Trainer.from_argparse_args(args)
    if args.checkpoint_epochs is not None:
        trainer.callbacks = [ModelCheckpoint(every_n_epochs=args.checkpoint_epochs)]
        _LOGGER.debug(
            "Checkpoints will be saved every %s epoch(s)", args.checkpoint_epochs
        )

    # Hyper-parameter overrides per quality level; "medium" keeps the
    # values that came from the command line / model defaults.
    quality_overrides = {
        "x-low": {
            "hidden_channels": 96,
            "inter_channels": 96,
            "filter_channels": 384,
        },
        "high": {
            "resblock": "1",
            "resblock_kernel_sizes": (3, 7, 11),
            "resblock_dilation_sizes": (
                (1, 3, 5),
                (1, 3, 5),
                (1, 3, 5),
            ),
            "upsample_rates": (8, 8, 2, 2),
            "upsample_initial_channel": 512,
            "upsample_kernel_sizes": (16, 16, 4, 4),
        },
    }
    dict_args = vars(args)
    dict_args.update(quality_overrides.get(args.quality, {}))

    model = VitsModel(
        num_symbols=num_symbols,
        num_speakers=num_speakers,
        sample_rate=sample_rate,
        dataset=[dataset_path],
        **dict_args,
    )

    single_speaker_checkpoint = args.resume_from_single_speaker_checkpoint
    if single_speaker_checkpoint:
        assert (
            num_speakers > 1
        ), "--resume_from_single_speaker_checkpoint is only for multi-speaker models. Use --resume_from_checkpoint for single-speaker models."

        # Load single-speaker checkpoint
        _LOGGER.debug(
            "Resuming from single-speaker checkpoint: %s",
            single_speaker_checkpoint,
        )
        model_single = VitsModel.load_from_checkpoint(
            single_speaker_checkpoint,
            dataset=None,
        )
        g_dict = model_single.model_g.state_dict()

        # Drop the keys that can't be copied over due to missing speaker embedding
        speaker_keys = [
            key
            for key in g_dict
            if key.startswith(("dec.cond", "dp.cond")) or "enc.cond_layer" in key
        ]
        for key in speaker_keys:
            del g_dict[key]

        # Copy over the multi-speaker model, excluding keys related to the
        # speaker embedding (which is missing from the single-speaker model).
        load_state_dict(model.model_g, g_dict)
        load_state_dict(model.model_d, model_single.model_d.state_dict())
        _LOGGER.info(
            "Successfully converted single-speaker checkpoint to multi-speaker"
        )

    trainer.fit(model)
def load_state_dict(model, saved_state_dict):
    """Load ``saved_state_dict`` into ``model``, tolerating missing keys.

    For every key in the model's own state dict the saved value is used when
    present; otherwise the model's current (initialized) value is kept and the
    key is logged at DEBUG level. Keys that exist only in ``saved_state_dict``
    are ignored.
    """
    merged = {}
    for name, initial_value in model.state_dict().items():
        if name not in saved_state_dict:
            # Use initialized value
            _LOGGER.debug("%s is not in the checkpoint", name)
            merged[name] = initial_value
            continue
        # Use saved value
        merged[name] = saved_state_dict[name]
    model.load_state_dict(merged)
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,19 @@
"""Shared access to package resources"""
import os
import typing
from pathlib import Path
# Pick whichever implementation of ``files`` is available.
try:
    import importlib.resources

    files = importlib.resources.files
except (ImportError, AttributeError):
    # Backport for Python < 3.9
    import importlib_resources  # type: ignore

    files = importlib_resources.files

_PACKAGE = "piper_train"

# Root directory of the installed package.
_package_root = files(_PACKAGE)
_DIR = Path(typing.cast(os.PathLike, _package_root))

# Version string, read from the VERSION file inside the package directory.
_version_file = _DIR / "VERSION"
__version__ = _version_file.read_text(encoding="utf-8").strip()

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import json
import sys
import unicodedata
from collections import Counter
from .phonemize import DEFAULT_PHONEME_ID_MAP
def main() -> None:
    """Summarize phoneme usage from JSON lines on stdin.

    Each non-empty input line must be a JSON object with a ``"phonemes"``
    list. Every phoneme is counted; phonemes absent from
    ``DEFAULT_PHONEME_ID_MAP`` are counted separately. A JSON report with
    ``"used"`` and ``"missing"`` sections is written to stdout, and the
    number of distinct missing phonemes is printed to stderr when non-zero.
    """
    used_phonemes: "Counter[str]" = Counter()
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        for phoneme in utt["phonemes"]:
            used_phonemes[phoneme] += 1

            if phoneme not in DEFAULT_PHONEME_ID_MAP:
                missing_phonemes[phoneme] += 1

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)

    def describe(counts: "Counter[str]") -> dict:
        """Build the per-phoneme report, most common phoneme first."""
        return {
            phoneme: {
                "count": count,
                "hex": f"\\u{hex(ord(phoneme))}",
                # Fixed: this field used to repeat the category. Characters
                # without a Unicode name get an empty string instead of
                # raising ValueError.
                "name": unicodedata.name(phoneme, ""),
                "category": unicodedata.category(phoneme),
            }
            for phoneme, count in counts.most_common()
        }

    json.dump(
        {
            "used": describe(used_phonemes),
            "missing": describe(missing_phonemes),
        },
        sys.stdout,
    )
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,50 @@
#!/usr/bin/env python3
import argparse
from concurrent.futures import ThreadPoolExecutor
import logging
from pathlib import Path
import torch
_LOGGER = logging.getLogger()
def main() -> None:
    """Check every ``*.pt`` file in a cache directory by loading it with torch.

    Files that fail to load are logged at ERROR level and, with ``--delete``,
    removed. The number of files actually deleted is printed at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cache-dir",
        required=True,
        help="Path to directory with audio/spectrogram files (*.pt)",
    )
    parser.add_argument(
        "--delete", action="store_true", help="Delete files that fail to load"
    )
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    cache_dir = Path(args.cache_dir)

    def check_file(pt_path: Path) -> bool:
        """Return True when ``pt_path`` failed to load and was deleted."""
        try:
            _LOGGER.debug("Checking %s", pt_path)
            torch.load(str(pt_path))
        except Exception:
            # Any load failure marks the file as bad.
            _LOGGER.error(pt_path)
            if args.delete:
                try:
                    pt_path.unlink()
                except OSError:
                    # Keep going with the other files, but make the failure
                    # visible instead of losing it inside a future.
                    _LOGGER.exception("Failed to delete %s", pt_path)
                else:
                    return True

        return False

    # Each worker returns its own result and the total is summed on the main
    # thread, so no counter is shared (and raced on) between threads.
    with ThreadPoolExecutor() as executor:
        num_deleted = sum(executor.map(check_file, cache_dir.glob("*.pt")))

    print("Deleted:", num_deleted, "file(s)")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
import torch
from .vits.lightning import VitsModel
_LOGGER = logging.getLogger("piper_train.export_generator")
def main():
    """Export the generator of a training checkpoint as a pickled torch model.

    The generator is switched to eval mode, weight norm is removed from its
    decoder, and ``forward`` is rebound to ``infer`` before saving, so that
    calling the saved model runs inference.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
    parser.add_argument("output", help="Path to output model (.pt)")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    checkpoint_path = Path(args.checkpoint)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    generator = VitsModel.load_from_checkpoint(checkpoint_path, dataset=None).model_g

    # Inference only
    generator.eval()

    with torch.no_grad():
        generator.dec.remove_weight_norm()

    generator.forward = generator.infer

    torch.save(generator, output_path)

    _LOGGER.info("Exported model to %s", output_path)
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
from typing import Optional
import torch
from .vits.lightning import VitsModel
_LOGGER = logging.getLogger("piper_train.export_onnx")
OPSET_VERSION = 15
def main() -> None:
    """Export the generator of a training checkpoint to a single ONNX file.

    The generator's ``forward`` is replaced by a wrapper around ``infer``
    that takes the three scale factors packed in one tensor, then the model
    is traced with a random dummy input and written to the output path.
    """
    torch.manual_seed(1234)

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
    parser.add_argument("output", help="Path to output model (.onnx)")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    checkpoint_path = Path(args.checkpoint)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    model_g = VitsModel.load_from_checkpoint(checkpoint_path, dataset=None).model_g

    num_symbols = model_g.n_vocab
    num_speakers = model_g.n_speakers

    # Inference only
    model_g.eval()

    with torch.no_grad():
        model_g.dec.remove_weight_norm()

    def infer_forward(text, text_lengths, scales, sid=None):
        # `scales` is indexed as [noise_scale, length_scale, noise_scale_w].
        result = model_g.infer(
            text,
            text_lengths,
            noise_scale=scales[0],
            length_scale=scales[1],
            noise_scale_w=scales[2],
            sid=sid,
        )
        return result[0].unsqueeze(1)

    model_g.forward = infer_forward

    # Random phoneme ids are enough for tracing.
    dummy_input_length = 50
    sequences = torch.randint(
        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
    )
    sequence_lengths = torch.LongTensor([sequences.size(1)])

    # A speaker id is only supplied for multi-speaker models.
    sid: Optional[torch.LongTensor] = (
        torch.LongTensor([0]) if num_speakers > 1 else None
    )

    # noise, length, noise_w
    scales = torch.FloatTensor([0.667, 1.0, 0.8])
    dummy_input = (sequences, sequence_lengths, scales, sid)

    # Export
    torch.onnx.export(
        model=model_g,
        args=dummy_input,
        f=str(output_path),
        verbose=False,
        opset_version=OPSET_VERSION,
        input_names=["input", "input_lengths", "scales", "sid"],
        output_names=["output"],
        dynamic_axes={
            "input": {0: "batch_size", 1: "phonemes"},
            "input_lengths": {0: "batch_size"},
            "output": {0: "batch_size", 1: "time"},
        },
    )

    _LOGGER.info("Exported model to %s", output_path)
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env python3
import argparse
import logging
import os
from pathlib import Path
from typing import Optional
import torch
from torch import nn
from .vits import commons
from .vits.lightning import VitsModel
_LOGGER = logging.getLogger("piper_train.export_onnx")
OPSET_VERSION = 15
class VitsEncoder(nn.Module):
    """First half of generator inference, wrapped as a module for ONNX export.

    Runs the wrapped generator's text encoder, optional speaker embedding and
    duration predictor, expands the prior statistics along the predicted
    alignment and samples ``z_p`` from them.
    """

    def __init__(self, gen):
        super().__init__()
        self.gen = gen

    def forward(self, x, x_lengths, scales, sid=None):
        """Return ``(z_p, y_mask, g)``; ``g`` is None for single-speaker models.

        ``scales`` is indexed as [noise_scale, length_scale, noise_scale_w].
        """
        gen = self.gen
        noise_scale, length_scale, noise_scale_w = scales[0], scales[1], scales[2]

        x, m_p, logs_p, x_mask = gen.enc_p(x, x_lengths)

        # Speaker conditioning only exists for multi-speaker models.
        g = None
        if gen.n_speakers > 1:
            assert sid is not None, "Missing speaker id"
            g = gen.emb_g(sid).unsqueeze(-1)  # [b, h, 1]

        if gen.use_sdp:
            logw = gen.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
        else:
            logw = gen.dp(x, x_mask, g=g)

        # Durations are rounded up; every item gets an output length of at least 1.
        w_ceil = torch.ceil(torch.exp(logw) * x_mask * length_scale)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = commons.sequence_mask(y_lengths, y_lengths.max())
        y_mask = torch.unsqueeze(y_mask, 1).type_as(x_mask)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        def expand(stat):
            # [b, t', t], [b, t, d] -> [b, d, t']
            return torch.matmul(attn.squeeze(1), stat.transpose(1, 2)).transpose(
                1, 2
            )

        m_p = expand(m_p)
        logs_p = expand(logs_p)

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        return z_p, y_mask, g
class VitsDecoder(nn.Module):
    """Second half of generator inference, wrapped as a module for ONNX export.

    Runs the wrapped generator's flow in reverse on ``z`` and feeds the masked
    result to its decoder.
    """

    def __init__(self, gen):
        super().__init__()
        self.gen = gen

    def forward(self, z, y_mask, g=None):
        gen = self.gen
        latent = gen.flow(z, y_mask, g=g, reverse=True)
        return gen.dec((latent * y_mask), g=g)
def main() -> None:
    """Export a checkpoint's generator as two ONNX models (encoder and decoder).

    The encoder is exported first; its output on the dummy input is then
    used as the example input for exporting the decoder.
    """
    torch.manual_seed(1234)

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
    parser.add_argument("output_dir", help="Path to output directory")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    # The export helpers read args.output_dir as a Path, so convert in place.
    args.checkpoint = Path(args.checkpoint)
    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    model_g = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None).model_g

    with torch.no_grad():
        model_g.dec.remove_weight_norm()

    _LOGGER.info("Exporting encoder...")
    encoder_output = export_encoder(args, model_g)

    _LOGGER.info("Exporting decoder...")
    export_decoder(args, model_g, encoder_output)

    _LOGGER.info("Exported model to %s", str(args.output_dir))
def export_encoder(args, model_g):
    """Export the encoder half of the generator to ``encoder.onnx``.

    Args:
        args: parsed arguments; ``args.output_dir`` must be a ``Path`` to the
            directory that receives ``encoder.onnx``.
        model_g: generator; ``n_vocab`` and ``n_speakers`` are read from it.

    Returns:
        The output of the wrapped encoder on the dummy input. ``main`` passes
        it on to ``export_decoder`` as the decoder's example input.
    """
    model = VitsEncoder(model_g)
    model.eval()

    num_symbols = model_g.n_vocab
    num_speakers = model_g.n_speakers
    is_multispeaker = num_speakers > 1

    dummy_input_length = 50
    sequences = torch.randint(
        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
    )
    sequence_lengths = torch.LongTensor([sequences.size(1)])

    sid: Optional[torch.LongTensor] = None
    if is_multispeaker:
        sid = torch.LongTensor([0])

    # noise, noise_w, length
    scales = torch.FloatTensor([0.667, 1.0, 0.8])
    dummy_input = (sequences, sequence_lengths, scales, sid)

    output_names = ["z", "y_mask"]
    if is_multispeaker:
        output_names.append("g")

    onnx_path = os.fspath(args.output_dir.joinpath("encoder.onnx"))

    # Export
    torch.onnx.export(
        model=model,
        args=dummy_input,
        f=onnx_path,
        verbose=False,
        opset_version=OPSET_VERSION,
        input_names=["input", "input_lengths", "scales", "sid"],
        output_names=output_names,
        dynamic_axes={
            "input": {0: "batch_size", 1: "phonemes"},
            "input_lengths": {0: "batch_size"},
            # Dynamic axes are keyed by tensor name. The encoder has no tensor
            # called "output"; its outputs are "z" and "y_mask", using the
            # same axes that export_decoder declares for its inputs.
            "z": {0: "batch_size", 2: "time"},
            "y_mask": {0: "batch_size", 2: "time"},
        },
    )
    _LOGGER.info("Exported encoder to %s", onnx_path)

    return model(*dummy_input)
def export_decoder(args, model_g, decoder_input):
    """Export the decoder half of the generator to ``decoder.onnx``.

    Args:
        args: parsed arguments; ``args.output_dir`` is the destination directory.
        model_g: generator wrapped by ``VitsDecoder``.
        decoder_input: example input handed to the ONNX exporter.
    """
    decoder = VitsDecoder(model_g)
    decoder.eval()

    decoder_input_names = ["z", "y_mask"]
    if model_g.n_speakers > 1:
        decoder_input_names.append("g")

    onnx_path = os.fspath(args.output_dir.joinpath("decoder.onnx"))

    # NOTE(review): "output" marks axis 1 as "time" while the inputs use
    # axis 2 -- confirm against the shape gen.dec actually returns.
    axes = {
        "z": {0: "batch_size", 2: "time"},
        "y_mask": {0: "batch_size", 2: "time"},
        "output": {0: "batch_size", 1: "time"},
    }

    # Export
    torch.onnx.export(
        model=decoder,
        args=decoder_input,
        f=onnx_path,
        verbose=False,
        opset_version=OPSET_VERSION,
        input_names=decoder_input_names,
        output_names=["output"],
        dynamic_axes=axes,
    )
    _LOGGER.info("Exported decoder to %s", onnx_path)
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
import torch
from .vits.lightning import VitsModel
_LOGGER = logging.getLogger("piper_train.export_torchscript")
def main():
    """Trace the generator of a checkpoint and save it as TorchScript."""
    torch.manual_seed(1234)

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
    # This script saves with torch.jit.save, so the output is TorchScript,
    # not ONNX as the help text used to say.
    parser.add_argument("output", help="Path to output model (.ts)")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    args.checkpoint = Path(args.checkpoint)
    args.output = Path(args.output)
    args.output.parent.mkdir(parents=True, exist_ok=True)

    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
    model_g = model.model_g

    num_symbols = model_g.n_vocab

    # Inference only
    model_g.eval()

    with torch.no_grad():
        model_g.dec.remove_weight_norm()

    # Trace the inference path instead of the training forward pass.
    model_g.forward = model_g.infer

    dummy_input_length = 50
    sequences = torch.randint(
        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
    )
    sequence_lengths = torch.LongTensor([sequences.size(1)])

    sid = torch.LongTensor([0])

    # Positional order: sequences, lengths, speaker id, then three scale
    # tensors (0.667, 1.0, 0.8).
    dummy_input = (
        sequences,
        sequence_lengths,
        sid,
        torch.FloatTensor([0.667]),
        torch.FloatTensor([1.0]),
        torch.FloatTensor([0.8]),
    )

    jitted_model = torch.jit.trace(model_g, dummy_input)
    torch.jit.save(jitted_model, str(args.output))

    _LOGGER.info("Saved TorchScript model to %s", args.output)
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,244 @@
#!/usr/bin/env python3
import argparse
import csv
import json
import re
import shutil
import statistics
import subprocess
import sys
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import asdict, dataclass
from enum import Enum
from pathlib import Path
from typing import Optional
import numpy as np
from .norm_audio import make_silence_detector, trim_silence
_DIR = Path(__file__).parent
# Punctuation removed from the text before the speaking rate is computed.
# This must be a character class: as a bare sequence, "." and "?" act as regex
# operators and the pattern only matches one exact run of all the marks.
_PUNCTUATION = re.compile(r"[.。,?¿؟!;:\-—]")


class ExcludeReason(str, Enum):
    """Reason an utterance is excluded from the filtered output."""

    MISSING = "file_missing"
    EMPTY = "file_empty"
    LOW = "rate_low"
    HIGH = "rate_high"


@dataclass
class Utterance:
    """One utterance together with its speaking rate.

    ``rate`` is the number of non-punctuation characters per second. It stays
    at 0.0 when ``duration_sec`` is not positive.
    """

    id: str
    text: str
    duration_sec: float
    speaker: str
    exclude_reason: Optional[ExcludeReason] = None
    rate: float = 0.0

    def __post_init__(self):
        if self.duration_sec > 0:
            # Don't include punctuation in the speaking rate calculation
            # since we remove silence.
            text_nopunct = _PUNCTUATION.sub("", self.text)
            self.rate = len(text_nopunct) / self.duration_sec
def main():
    """Filter utterances by speaking rate.

    Reads ``|``-delimited metadata rows from stdin (file name first, text
    last, speaker second when a row has more than two columns). The speaking
    rate of every utterance is measured, and utterances inside the accepted
    range for their speaker are written to stdout. Utterances whose audio is
    missing or empty are never written and do not take part in the rate
    statistics. With ``--write-json``, per-speaker statistics and the
    excluded utterances are written to that file.

    Raises:
        RuntimeError: if ``ffmpeg`` is not on the PATH.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--write-json", help="Path to write information about excluded utterances"
    )
    parser.add_argument(
        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
    )
    parser.add_argument("--scale-lower", type=float, default=2.0)
    parser.add_argument("--scale-upper", type=float, default=2.0)
    args = parser.parse_args()

    # ffmpeg is the binary ProcessUtterance.get_duration actually invokes.
    if not shutil.which("ffmpeg"):
        raise RuntimeError("ffmpeg not found (is ffmpeg installed?)")

    dataset_dir = Path(args.dataset_dir)
    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    reader = csv.reader(sys.stdin, delimiter="|")
    text_and_audio = []
    for row in reader:
        if not row:
            # csv yields [] for blank lines; there is nothing to process.
            continue

        filename, text = row[0], row[-1]
        speaker = row[1] if len(row) > 2 else "default"

        # Try, in order: the name relative to the dataset directory, the same
        # with .wav, then both again under wav/ (or wavs/). If none exists
        # the last candidate is kept and later reported as missing.
        candidates = (
            dataset_dir / filename,
            dataset_dir / f"{filename}.wav",
            wav_dir / filename,
            wav_dir / f"{filename}.wav",
        )
        wav_path = next(
            (path for path in candidates if path.exists()), candidates[-1]
        )

        text_and_audio.append((filename, text, wav_path, speaker))

    writer = csv.writer(sys.stdout, delimiter="|")

    # speaker -> [utterance]
    utts_by_speaker = defaultdict(list)
    process_utterance = ProcessUtterance()
    with ThreadPoolExecutor() as executor:
        for utt in executor.map(
            lambda item: process_utterance(*item), text_and_audio
        ):
            utts_by_speaker[utt.speaker].append(utt)

    is_multispeaker = len(utts_by_speaker) > 1
    speaker_details = {}

    for speaker, utts in utts_by_speaker.items():
        # Utterances that already carry a reason (missing/empty file) have a
        # rate of 0.0. Keep them out of the statistics and the output so the
        # quantiles are not skewed and their original reason is preserved.
        valid_utts = [utt for utt in utts if utt.exclude_reason is None]
        rates = [utt.rate for utt in valid_utts]

        rate_qs = []
        lower = upper = None
        if len(rates) >= 2:
            # Exclude rates well outside the 25%/75% quantiles.
            # statistics.quantiles needs at least two data points.
            rate_qs = statistics.quantiles(rates, n=4)
            q1 = rate_qs[0]  # 25%
            q3 = rate_qs[-1]  # 75%
            iqr = q3 - q1
            lower = q1 - (args.scale_lower * iqr)
            upper = q3 + (args.scale_upper * iqr)

        speaker_details[speaker] = {
            "min": min(rates) if rates else None,
            "max": max(rates) if rates else None,
            # Key spelling is kept as-is for compatibility with existing
            # consumers of the JSON report.
            "quanties": rate_qs,
            "lower": lower,
            "upper": upper,
        }

        for utt in valid_utts:
            if (lower is not None) and (utt.rate < lower):
                utt.exclude_reason = ExcludeReason.LOW
            elif (upper is not None) and (utt.rate > upper):
                utt.exclude_reason = ExcludeReason.HIGH
            elif is_multispeaker:
                writer.writerow((utt.id, utt.speaker, utt.text))
            else:
                writer.writerow((utt.id, utt.text))

    if args.write_json:
        speaker_excluded = {
            speaker: [
                asdict(utt)
                for utt in utts_by_speaker[speaker]
                if utt.exclude_reason is not None
            ]
            for speaker in speaker_details
        }

        with open(args.write_json, "w", encoding="utf-8") as json_file:
            json.dump(
                {
                    speaker: {
                        "details": speaker_details[speaker],
                        "num_utterances": len(utts_by_speaker[speaker]),
                        "num_excluded": len(speaker_excluded[speaker]),
                        "excluded": speaker_excluded[speaker],
                    }
                    for speaker in speaker_details
                },
                json_file,
                indent=4,
                ensure_ascii=False,
            )
class ProcessUtterance:
    """Callable that turns one metadata entry into an ``Utterance``.

    A silence detector is created lazily for each thread and stored in
    ``threading.local`` storage.
    """

    def __init__(self):
        self.thread_data = threading.local()

    def __call__(
        self, utt_id: str, text: str, wav_path: Path, speaker: str
    ) -> Utterance:
        """Build the ``Utterance`` for ``wav_path``.

        A missing or zero-byte file gives an utterance with duration 0.0 and
        the matching ``ExcludeReason``. Otherwise the duration comes from
        ``get_duration``.
        """
        if not wav_path.exists():
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.MISSING,
            )

        if wav_path.stat().st_size == 0:
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.EMPTY,
            )

        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)

    def get_duration(self, audio_path: Path) -> float:
        """Return the speaking duration of ``audio_path`` in seconds.

        ffmpeg decodes the file to 16 kHz mono 16-bit PCM. The samples are
        peak-normalized and ``trim_silence`` determines the duration.

        Raises:
            subprocess.CalledProcessError: if ffmpeg exits with an error.
        """
        if not hasattr(self.thread_data, "detector"):
            self.thread_data.detector = make_silence_detector()

        vad_sample_rate = 16000
        audio_16khz_bytes = subprocess.check_output(
            [
                "ffmpeg",
                "-i",
                str(audio_path),
                "-f",
                "s16le",
                "-acodec",
                "pcm_s16le",
                "-ac",
                "1",
                "-ar",
                str(vad_sample_rate),
                "pipe:",
            ],
            stderr=subprocess.DEVNULL,
        )

        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
            np.float32
        )
        if audio_16khz.size == 0:
            # Nothing was decoded, so there is no speech to measure.
            return 0.0

        # Normalize by the peak magnitude. The largest *absolute* sample must
        # be used, otherwise a signal whose negative peak dominates ends up
        # outside [-1, 1]. Silent audio is left untouched to avoid 0/0.
        peak = np.max(np.abs(audio_16khz))
        if peak > 0:
            audio_16khz /= peak

        # Get speaking duration
        offset_sec, duration_sec = trim_silence(
            audio_16khz,
            self.thread_data.detector,
            threshold=0.8,
            samples_per_chunk=480,
            sample_rate=vad_sample_rate,
            keep_chunks_before=2,
            keep_chunks_after=2,
        )

        if duration_sec is None:
            # Speech goes to end of audio
            duration_sec = (len(audio_16khz) / vad_sample_rate) - offset_sec

        return duration_sec
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env python3
import argparse
import json
import logging
import sys
import time
from pathlib import Path
import torch
from .vits.lightning import VitsModel
from .vits.utils import audio_float_to_int16
from .vits.wavfile import write as write_wav
_LOGGER = logging.getLogger("piper_train.infer")
def main():
    """Synthesize one WAV file per JSON line on stdin using a checkpoint."""
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(prog="piper_train.infer")
    parser.add_argument(
        "--checkpoint", required=True, help="Path to model checkpoint (.ckpt)"
    )
    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
    parser.add_argument("--sample-rate", type=int, default=22050)
    parser.add_argument("--noise-scale", type=float, default=0.667)
    parser.add_argument("--length-scale", type=float, default=1.0)
    parser.add_argument("--noise-w", type=float, default=0.8)
    args = parser.parse_args()

    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)

    # Inference only
    model.eval()
    with torch.no_grad():
        model.model_g.dec.remove_weight_norm()

    for line_index, raw_line in enumerate(sys.stdin):
        stripped = raw_line.strip()
        if not stripped:
            continue

        utterance = json.loads(stripped)
        phoneme_ids = utterance["phoneme_ids"]
        speaker_id = utterance.get("speaker_id")

        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
        text_lengths = torch.LongTensor([len(phoneme_ids)])
        scales = [args.noise_scale, args.length_scale, args.noise_w]
        if speaker_id is None:
            sid = None
        else:
            sid = torch.LongTensor([speaker_id])

        # The timed span covers both synthesis and the int16 conversion.
        started = time.perf_counter()
        audio = model(text, text_lengths, scales, sid=sid).detach().numpy()
        audio = audio_float_to_int16(audio)
        infer_sec = time.perf_counter() - started

        audio_duration_sec = audio.shape[-1] / args.sample_rate
        if audio_duration_sec > 0:
            real_time_factor = infer_sec / audio_duration_sec
        else:
            real_time_factor = 0.0

        _LOGGER.debug(
            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
            line_index + 1,
            real_time_factor,
            infer_sec,
            audio_duration_sec,
        )

        # Output files are named after the zero-based input line index.
        output_path = args.output_dir / f"{line_index}.wav"
        write_wav(str(output_path), args.sample_rate, audio)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env python3
import argparse
import json
import logging
import sys
import time
from pathlib import Path
import torch
from .vits.utils import audio_float_to_int16
from .vits.wavfile import write as write_wav
_LOGGER = logging.getLogger("piper_train.infer_generator")
def main():
    """Synthesize one WAV file per JSON line on stdin using a saved generator."""
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(prog="piper_train.infer_generator")
    parser.add_argument("--model", required=True, help="Path to generator (.pt)")
    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
    parser.add_argument("--sample-rate", type=int, default=22050)
    args = parser.parse_args()

    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE: torch.load unpickles the file; only load models you trust.
    model = torch.load(args.model)

    # Inference only
    model.eval()

    for line_index, raw_line in enumerate(sys.stdin):
        stripped = raw_line.strip()
        if not stripped:
            continue

        utterance = json.loads(stripped)
        phoneme_ids = utterance["phoneme_ids"]
        speaker_id = utterance.get("speaker_id")

        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
        text_lengths = torch.LongTensor([len(phoneme_ids)])
        if speaker_id is None:
            sid = None
        else:
            sid = torch.LongTensor([speaker_id])

        # The timed span covers both synthesis and the int16 conversion.
        started = time.perf_counter()
        # No scale tensors are passed to the model here.
        model_output = model(text, text_lengths, sid)
        audio = model_output[0].detach().numpy()
        audio = audio_float_to_int16(audio)
        infer_sec = time.perf_counter() - started

        audio_duration_sec = audio.shape[-1] / args.sample_rate
        if audio_duration_sec > 0:
            real_time_factor = infer_sec / audio_duration_sec
        else:
            real_time_factor = 0.0

        _LOGGER.debug(
            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
            line_index + 1,
            real_time_factor,
            infer_sec,
            audio_duration_sec,
        )

        # Output files are named after the zero-based input line index.
        output_path = args.output_dir / f"{line_index}.wav"
        write_wav(str(output_path), args.sample_rate, audio)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
import argparse
import json
import logging
import math
import sys
import time
from pathlib import Path
import numpy as np
import onnxruntime
from .vits.utils import audio_float_to_int16
from .vits.wavfile import write as write_wav
_LOGGER = logging.getLogger("piper_train.infer_onnx")
def main():
    """Synthesize one WAV file per JSON line on stdin using an ONNX model.

    Each input line is a JSON object with ``phoneme_ids`` and an optional
    ``speaker_id``. Output files are named after the zero-based line index.
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser(prog="piper_train.infer_onnx")
    parser.add_argument("--model", required=True, help="Path to model (.onnx)")
    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
    parser.add_argument("--sample-rate", type=int, default=22050)
    parser.add_argument("--noise-scale", type=float, default=0.667)
    parser.add_argument("--noise-scale-w", type=float, default=0.8)
    parser.add_argument("--length-scale", type=float, default=1.0)
    args = parser.parse_args()

    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    sess_options = onnxruntime.SessionOptions()
    _LOGGER.debug("Loading model from %s", args.model)
    model = onnxruntime.InferenceSession(str(args.model), sess_options=sess_options)
    _LOGGER.info("Loaded model from %s", args.model)

    # NOTE: denoising with a bias spectrogram (see denoise) is disabled.

    for i, line in enumerate(sys.stdin):
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        utt_id = str(i)
        phoneme_ids = utt["phoneme_ids"]
        speaker_id = utt.get("speaker_id")

        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths = np.array([text.shape[1]], dtype=np.int64)
        scales = np.array(
            [args.noise_scale, args.length_scale, args.noise_scale_w],
            dtype=np.float32,
        )

        feed = {
            "input": text,
            "input_lengths": text_lengths,
            "scales": scales,
        }
        # Only feed "sid" when a speaker id is present, so the session is
        # never handed a None-valued input.
        if speaker_id is not None:
            feed["sid"] = np.array([speaker_id], dtype=np.int64)

        # The timed span covers both synthesis and the int16 conversion.
        start_time = time.perf_counter()
        audio = model.run(None, feed)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        end_time = time.perf_counter()

        audio_duration_sec = audio.shape[-1] / args.sample_rate
        infer_sec = end_time - start_time
        real_time_factor = (
            infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
        )

        _LOGGER.debug(
            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
            i + 1,
            real_time_factor,
            infer_sec,
            audio_duration_sec,
        )

        output_path = args.output_dir / f"{utt_id}.wav"
        write_wav(str(output_path), args.sample_rate, audio)
def denoise(
    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
) -> np.ndarray:
    """Subtract a scaled bias spectrogram from the audio's magnitude spectrum.

    The bias is repeated along its last axis and cropped to the number of
    frames in the audio. Magnitudes are clipped at zero before the signal is
    rebuilt from the original phase with ``inverse``.
    """
    spec, angles = transform(audio)

    n_bias_frames = bias_spec.shape[-1]
    n_audio_frames = spec.shape[-1]
    n_repeats = max(1, math.ceil(n_audio_frames / n_bias_frames))
    bias = np.repeat(bias_spec, n_repeats, axis=-1)[..., :n_audio_frames]

    cleaned = np.clip(spec - (bias * denoiser_strength), a_min=0.0, a_max=None)
    return inverse(cleaned, angles)
def stft(x, fft_size, hopsamp):
    """Compute the short-time Fourier transform of a time domain signal.

    Args:
        x (1-dim Numpy array): A time domain signal.
        fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
        hopsamp (int): Number of samples between the starts of adjacent frames.

    Returns:
        The STFT. The rows are the time slices and columns are the frequency bins.
    """
    window = np.hanning(fft_size)
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)

    frames = []
    for start in range(0, len(x) - fft_size, hopsamp):
        segment = x[start : start + fft_size]
        frames.append(np.fft.rfft(window * segment))
    return np.array(frames)
def istft(X, fft_size, hopsamp):
    """Invert a STFT into a time domain signal by windowed overlap-add.

    Args:
        X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
        fft_size (int): FFT size used for the window length.
        hopsamp (int): The hop size, in samples.

    Returns:
        The inverse STFT.
    """
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)
    window = np.hanning(fft_size)

    n_slices = X.shape[0]
    signal = np.zeros(int(n_slices * hopsamp + fft_size))

    hop_starts = range(0, signal.shape[0] - fft_size, hopsamp)
    for slice_idx, start in enumerate(hop_starts):
        frame = np.real(np.fft.irfft(X[slice_idx]))
        signal[start : start + fft_size] += window * frame
    return signal
def inverse(magnitude, phase):
    """Rebuild time domain signals from magnitude and phase spectrograms.

    Real and imaginary parts are computed from magnitude/phase, packed into a
    complex64 array, and each item is inverted with ``istft`` (FFT size 1024,
    hop 256). The results are stacked along axis 0.
    """
    real_imag = np.concatenate(
        [magnitude * np.cos(phase), magnitude * np.sin(phase)], axis=1
    )

    n_b, n_f, n_t = real_imag.shape
    half = n_f // 2
    spec = np.empty([n_b, half, n_t], dtype=np.complex64)
    spec.real = real_imag[:, :half]
    spec.imag = real_imag[:, half:]

    signals = [
        istft(item.T, fft_size=1024, hopsamp=256)[None, :] for item in spec
    ]
    return np.concatenate(signals, 0)
def transform(input_data):
    """Return magnitude and phase spectrograms for each signal in ``input_data``.

    Each item is transformed with ``stft`` (FFT size 1024, hop 256) and
    transposed before the per-item results are stacked along axis 0.
    """
    real_parts = []
    imag_parts = []
    for signal in input_data:
        spec = stft(signal, fft_size=1024, hopsamp=256).T
        real_parts.append(spec.real[None, :, :])
        imag_parts.append(spec.imag[None, :, :])

    real_part = np.concatenate(real_parts, 0)
    imag_part = np.concatenate(imag_parts, 0)

    magnitude = np.sqrt(real_part**2 + imag_part**2)
    phase = np.arctan2(imag_part.data, real_part.data)
    return magnitude, phase
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,295 @@
#!/usr/bin/env python3
import argparse
import json
import logging
import math
import os
import sys
import time
from pathlib import Path
import numpy as np
import onnxruntime
from .vits.utils import audio_float_to_int16
_LOGGER = logging.getLogger("piper_train.infer_onnx")
class SpeechStreamer:
    """
    Stream speech in real time.

    Args:
        encoder_path: path to encoder ONNX model
        decoder_path: path to decoder ONNX model
        sample_rate: output sample rate
        chunk_size: number of mel frames to decode in each step
            (audio samples per chunk = chunk_size * hop_length)
        chunk_padding: number of mel frames to be concatenated to the start
            and end of the current chunk to reduce decoding artifacts
        hop_length: number of audio samples the decoder produces per mel frame

    Raises:
        ValueError: if ``chunk_size`` < 1 or ``chunk_padding`` < 0.
    """

    def __init__(
        self,
        encoder_path,
        decoder_path,
        sample_rate,
        chunk_size=45,
        chunk_padding=10,
        hop_length=256,
    ):
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")
        if chunk_padding < 0:
            raise ValueError("chunk_padding must not be negative")

        sess_options = onnxruntime.SessionOptions()
        _LOGGER.debug("Loading encoder model from %s", encoder_path)
        self.encoder = onnxruntime.InferenceSession(
            encoder_path, sess_options=sess_options
        )
        _LOGGER.debug("Loading decoder model from %s", decoder_path)
        self.decoder = onnxruntime.InferenceSession(
            decoder_path, sess_options=sess_options
        )

        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.chunk_padding = chunk_padding
        self.hop_length = hop_length

    def encoder_infer(self, enc_input):
        """Run the encoder and return its list of outputs.

        Entries of ``enc_input`` whose value is None (for example a missing
        speaker id) are not fed to the session.
        """
        feed = {
            name: value for name, value in enc_input.items() if value is not None
        }

        enc_start = time.perf_counter()
        enc_output = self.encoder.run(None, feed)
        enc_infer = time.perf_counter() - enc_start
        _LOGGER.debug("Encoder inference %s", round(enc_infer * 1000))

        wav_length = enc_output[0].shape[2] * self.hop_length
        if wav_length > 0:
            enc_rtf = round(enc_infer / (wav_length / self.sample_rate), 2)
            _LOGGER.debug("Encoder RTF %s", enc_rtf)

        return enc_output

    def decoder_infer(self, z, y_mask, g=None):
        """Run the decoder on one chunk and return the squeezed audio array."""
        dec_input = {"z": z, "y_mask": y_mask}
        # g is an array when present; its truth value is ambiguous, so test
        # against None explicitly.
        if g is not None:
            dec_input["g"] = g

        dec_start = time.perf_counter()
        audio = self.decoder.run(None, dec_input)[0].squeeze()
        dec_infer = time.perf_counter() - dec_start
        _LOGGER.debug("Decoder inference %s", round(dec_infer * 1000))

        n_samples = audio.size
        if n_samples > 0:
            dec_rtf = round(dec_infer / (n_samples / self.sample_rate), 2)
            _LOGGER.debug("Decoder RTF %s", dec_rtf)

        return audio

    def chunk(self, enc_output):
        """Yield decoded audio for ``enc_output``, one chunk at a time.

        ``enc_output`` is ``(z, y_mask, *extra)``; the extra items are passed
        to the decoder with every chunk. Each chunk is decoded together with
        up to ``chunk_padding`` neighbouring frames on both sides, and the
        audio belonging to that padding is cut off again before yielding.
        """
        z, y_mask, *dec_args = enc_output
        n_frames = z.shape[2]
        padding = self.chunk_padding
        hop_length = self.hop_length

        if n_frames <= (self.chunk_size + (2 * padding)):
            # Too short to stream. This is a generator, so the audio must be
            # yielded; a plain `return audio` would discard it.
            yield self.decoder_infer(z, y_mask, *dec_args)
            return

        split_at = [
            i * self.chunk_size for i in range(1, math.ceil(n_frames / self.chunk_size))
        ]
        chunks = list(
            zip(
                np.split(z, split_at, axis=2),
                np.split(y_mask, split_at, axis=2),
            )
        )

        for idx, (z_chunk, y_mask_chunk) in enumerate(chunks):
            # Reset for every chunk so that padding from a previous iteration
            # never trims a chunk that did not receive that padding.
            wav_start = 0
            wav_end = None

            if (padding > 0) and (idx > 0):
                prev_z, prev_y_mask = chunks[idx - 1]
                start_zpad = prev_z[:, :, -padding:]
                start_ypad = prev_y_mask[:, :, -padding:]
                z_chunk = np.concatenate([start_zpad, z_chunk], axis=2)
                y_mask_chunk = np.concatenate([start_ypad, y_mask_chunk], axis=2)
                wav_start = start_zpad.shape[2] * hop_length

            if (padding > 0) and ((idx + 1) < len(chunks)):
                next_z, next_y_mask = chunks[idx + 1]
                end_zpad = next_z[:, :, :padding]
                end_ypad = next_y_mask[:, :, :padding]
                z_chunk = np.concatenate([z_chunk, end_zpad], axis=2)
                y_mask_chunk = np.concatenate([y_mask_chunk, end_ypad], axis=2)
                # Chunks are never empty, so this is strictly negative.
                wav_end = -(end_zpad.shape[2] * hop_length)

            audio = self.decoder_infer(z_chunk, y_mask_chunk, *dec_args)
            yield audio[wav_start:wav_end]

    def stream(self, encoder_input):
        """Yield synthesized audio as int16 byte strings, chunk by chunk."""
        start_time = time.perf_counter()
        has_shown_latency = False
        _LOGGER.debug("Starting synthesis")
        enc_output = self.encoder_infer(encoder_input)
        for wav in self.chunk(enc_output):
            if len(wav) == 0:
                continue
            if not has_shown_latency:
                # Time from the start of synthesis to the first audio chunk.
                latency = round((time.perf_counter() - start_time) * 1000)
                _LOGGER.debug("Latency %s", latency)
                has_shown_latency = True
            audio = audio_float_to_int16(wav)
            yield audio.tobytes()
        _LOGGER.debug("Synthesis done!")
def main():
    """Stream synthesized audio for each JSON line on stdin to stdout.

    Each input line is a JSON object with ``phoneme_ids`` and an optional
    ``speaker_id``. Audio is written to stdout as raw int16 bytes and flushed
    after every chunk.
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser(prog="piper_train.infer_onnx_streaming")
    parser.add_argument(
        "--encoder", required=True, help="Path to encoder model (.onnx)"
    )
    parser.add_argument(
        "--decoder", required=True, help="Path to decoder model (.onnx)"
    )
    parser.add_argument("--sample-rate", type=int, default=22050)
    parser.add_argument("--noise-scale", type=float, default=0.667)
    parser.add_argument("--noise-scale-w", type=float, default=0.8)
    parser.add_argument("--length-scale", type=float, default=1.0)
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=45,
        help="Number of mel frames to decode at each step"
    )
    parser.add_argument(
        "--chunk-padding",
        type=int,
        default=5,
        help="Number of mel frames to add to the start and end of the current chunk to reduce decoding artifacts"
    )
    args = parser.parse_args()

    streamer = SpeechStreamer(
        encoder_path=os.fspath(args.encoder),
        decoder_path=os.fspath(args.decoder),
        sample_rate=args.sample_rate,
        chunk_size=args.chunk_size,
        chunk_padding=args.chunk_padding,
    )

    output_buffer = sys.stdout.buffer

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        phoneme_ids = utt["phoneme_ids"]
        speaker_id = utt.get("speaker_id")

        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths = np.array([text.shape[1]], dtype=np.int64)
        scales = np.array(
            [args.noise_scale, args.length_scale, args.noise_scale_w],
            dtype=np.float32,
        )

        encoder_input = {
            "input": text,
            "input_lengths": text_lengths,
            "scales": scales,
        }
        # Only feed "sid" when a speaker id is present, so the encoder
        # session is never handed a None-valued input.
        if speaker_id is not None:
            encoder_input["sid"] = np.array([speaker_id], dtype=np.int64)

        for wav_chunk in streamer.stream(encoder_input):
            output_buffer.write(wav_chunk)
            output_buffer.flush()
def denoise(
    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
) -> np.ndarray:
    """Subtract a scaled bias spectrogram from the audio's magnitude spectrum.

    The bias is repeated along its last axis and cropped to the number of
    frames in the audio. Magnitudes are clipped at zero before the signal is
    rebuilt from the original phase with ``inverse``.
    """
    magnitudes, angles = transform(audio)

    bias_frames = bias_spec.shape[-1]
    audio_frames = magnitudes.shape[-1]
    repeat_count = max(1, math.ceil(audio_frames / bias_frames))
    repeated_bias = np.repeat(bias_spec, repeat_count, axis=-1)[..., :audio_frames]

    reduced = magnitudes - (repeated_bias * denoiser_strength)
    reduced = np.clip(reduced, a_min=0.0, a_max=None)
    return inverse(reduced, angles)
def stft(x, fft_size, hopsamp):
    """Compute the short-time Fourier transform of a time domain signal.

    Args:
        x (1-dim Numpy array): A time domain signal.
        fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
        hopsamp (int): Number of samples between the starts of adjacent frames.

    Returns:
        The STFT. The rows are the time slices and columns are the frequency bins.
    """
    window = np.hanning(fft_size)
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)

    spectra = []
    frame_starts = range(0, len(x) - fft_size, hopsamp)
    for begin in frame_starts:
        windowed = window * x[begin : begin + fft_size]
        spectra.append(np.fft.rfft(windowed))
    return np.array(spectra)
def istft(X, fft_size, hopsamp):
    """Invert a STFT into a time domain signal by windowed overlap-add.

    Args:
        X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
        fft_size (int): FFT size used for the window length.
        hopsamp (int): The hop size, in samples.

    Returns:
        The inverse STFT.
    """
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)
    window = np.hanning(fft_size)

    time_slices = X.shape[0]
    output = np.zeros(int(time_slices * hopsamp + fft_size))

    offsets = range(0, len(output) - fft_size, hopsamp)
    for row, offset in enumerate(offsets):
        time_frame = np.real(np.fft.irfft(X[row]))
        output[offset : offset + fft_size] += window * time_frame
    return output
def inverse(magnitude, phase):
    """Rebuild time domain signals from magnitude and phase spectrograms.

    Real and imaginary parts are computed from magnitude/phase, packed into a
    complex64 array, and each item is inverted with ``istft`` (FFT size 1024,
    hop 256). The results are stacked along axis 0.
    """
    stacked = np.concatenate(
        [magnitude * np.cos(phase), magnitude * np.sin(phase)], axis=1
    )

    n_b, n_f, n_t = stacked.shape
    n_bins = n_f // 2
    complex_spec = np.empty([n_b, n_bins, n_t], dtype=np.complex64)
    complex_spec.real = stacked[:, :n_bins]
    complex_spec.imag = stacked[:, n_bins:]

    waveforms = []
    for item in complex_spec:
        waveform = istft(item.T, fft_size=1024, hopsamp=256)
        waveforms.append(waveform[None, :])
    return np.concatenate(waveforms, 0)
def transform(input_data):
    """Return magnitude and phase spectrograms for each signal in ``input_data``.

    Each item is transformed with ``stft`` (FFT size 1024, hop 256) and
    transposed before the per-item results are stacked along axis 0.
    """
    reals, imags = [], []
    for item in input_data:
        item_spec = stft(item, fft_size=1024, hopsamp=256).T
        reals.append(item_spec.real[None, :, :])
        imags.append(item_spec.imag[None, :, :])

    real_part = np.concatenate(reals, 0)
    imag_part = np.concatenate(imags, 0)

    magnitude = np.sqrt(real_part**2 + imag_part**2)
    phase = np.arctan2(imag_part.data, real_part.data)
    return magnitude, phase
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
import argparse
import json
import logging
import sys
import time
from pathlib import Path
import torch
from .vits.utils import audio_float_to_int16
from .vits.wavfile import write as write_wav
_LOGGER = logging.getLogger("piper_train.infer_torchscript")
def main():
    """Synthesize one WAV file per JSON line on stdin using a TorchScript model."""
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(prog="piper_train.infer_torchscript")
    parser.add_argument(
        "--model", required=True, help="Path to torchscript checkpoint (.ts)"
    )
    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
    parser.add_argument("--sample-rate", type=int, default=22050)
    args = parser.parse_args()

    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    model = torch.jit.load(args.model)

    # Inference only
    model.eval()

    for line_index, raw_line in enumerate(sys.stdin):
        stripped = raw_line.strip()
        if not stripped:
            continue

        utterance = json.loads(stripped)
        phoneme_ids = utterance["phoneme_ids"]
        speaker_id = utterance.get("speaker_id")

        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
        text_lengths = torch.LongTensor([len(phoneme_ids)])
        if speaker_id is None:
            sid = None
        else:
            sid = torch.LongTensor([speaker_id])

        # The timed span covers both synthesis and the int16 conversion.
        started = time.perf_counter()
        model_output = model(
            text,
            text_lengths,
            sid,
            torch.FloatTensor([0.667]),
            torch.FloatTensor([1.0]),
            torch.FloatTensor([0.8]),
        )
        audio = model_output[0].detach().numpy()
        audio = audio_float_to_int16(audio)
        infer_sec = time.perf_counter() - started

        audio_duration_sec = audio.shape[-1] / args.sample_rate
        if audio_duration_sec > 0:
            real_time_factor = infer_sec / audio_duration_sec
        else:
            real_time_factor = 0.0

        _LOGGER.debug(
            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
            line_index + 1,
            real_time_factor,
            infer_sec,
            audio_duration_sec,
        )

        # Output files are named after the zero-based input line index.
        output_path = args.output_dir / f"{line_index}.wav"
        write_wav(str(output_path), args.sample_rate, audio)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,92 @@
from hashlib import sha256
from pathlib import Path
from typing import Optional, Tuple, Union
import librosa
import torch
from piper_train.vits.mel_processing import spectrogram_torch
from .trim import trim_silence
from .vad import SileroVoiceActivityDetector
_DIR = Path(__file__).parent
def make_silence_detector() -> SileroVoiceActivityDetector:
    """Create a detector from ``models/silero_vad.onnx`` next to this module."""
    return SileroVoiceActivityDetector(_DIR / "models" / "silero_vad.onnx")
def cache_norm_audio(
    audio_path: Union[str, Path],
    cache_dir: Union[str, Path],
    detector: SileroVoiceActivityDetector,
    sample_rate: int,
    silence_threshold: float = 0.2,
    silence_samples_per_chunk: int = 480,
    silence_keep_chunks_before: int = 2,
    silence_keep_chunks_after: int = 2,
    filter_length: int = 1024,
    window_length: int = 1024,
    hop_length: int = 256,
    ignore_cache: bool = False,
) -> Tuple[Path, Path]:
    """Cache the silence-trimmed audio and its spectrogram as tensors.

    Both files live in ``cache_dir`` and are named after the SHA256 of the
    absolute audio path. Existing files are reused unless ``ignore_cache`` is
    set.

    Returns:
        ``(audio_norm_path, audio_spec_path)``: the cached audio tensor
        (``.pt``) and the cached spectrogram tensor (``.spec.pt``).
    """
    source_path = Path(audio_path).absolute()
    cache_root = Path(cache_dir)

    # Cache id is the SHA256 of the full audio path
    cache_id = sha256(str(source_path).encode()).hexdigest()
    audio_norm_path = cache_root / f"{cache_id}.pt"
    audio_spec_path = cache_root / f"{cache_id}.spec.pt"

    audio_norm_tensor: Optional[torch.FloatTensor] = None

    if ignore_cache or (not audio_norm_path.exists()):
        # Trim silence first.
        #
        # The VAD model works on 16khz, so the portion of audio to keep is
        # determined at that rate and then just that portion is loaded at the
        # target sample rate.
        vad_sample_rate = 16000
        vad_audio, _ = librosa.load(path=source_path, sr=vad_sample_rate)

        offset_sec, duration_sec = trim_silence(
            vad_audio,
            detector,
            threshold=silence_threshold,
            samples_per_chunk=silence_samples_per_chunk,
            sample_rate=vad_sample_rate,
            keep_chunks_before=silence_keep_chunks_before,
            keep_chunks_after=silence_keep_chunks_after,
        )

        # NOTE: audio is already in [-1, 1] coming from librosa
        trimmed_audio, _ = librosa.load(
            path=source_path,
            sr=sample_rate,
            offset=offset_sec,
            duration=duration_sec,
        )

        # Save to cache directory
        audio_norm_tensor = torch.FloatTensor(trimmed_audio).unsqueeze(0)
        torch.save(audio_norm_tensor, audio_norm_path)

    if ignore_cache or (not audio_spec_path.exists()):
        if audio_norm_tensor is None:
            # Load pre-cached normalized audio
            audio_norm_tensor = torch.load(audio_norm_path)

        spectrogram = spectrogram_torch(
            y=audio_norm_tensor,
            n_fft=filter_length,
            sampling_rate=sample_rate,
            hop_size=hop_length,
            win_size=window_length,
            center=False,
        )
        torch.save(spectrogram.squeeze(0), audio_spec_path)

    return audio_norm_path, audio_spec_path

View File

@@ -0,0 +1,54 @@
from typing import Optional, Tuple
import numpy as np
from .vad import SileroVoiceActivityDetector
def trim_silence(
    audio_array: np.ndarray,
    detector: SileroVoiceActivityDetector,
    threshold: float = 0.2,
    samples_per_chunk=480,
    sample_rate=16000,
    keep_chunks_before: int = 2,
    keep_chunks_after: int = 2,
) -> Tuple[float, Optional[float]]:
    """Locate the main block of speech and return it as (offset, duration).

    The audio is walked in consecutive chunks of ``samples_per_chunk`` samples
    and each chunk is scored by ``detector``; a score of at least ``threshold``
    counts as speech.

    Returns:
        ``(offset_sec, duration_sec)`` in seconds. ``(0.0, None)`` is returned
        when fewer than two chunks were classified as speech, i.e. no trimming
        is requested.

    Notes:
        * The chunk that is current when the remaining audio runs out (the
          final chunk) is never passed to the detector.
        * The padded end index is capped at the number of evaluated chunks.
    """
    seconds_per_chunk: float = samples_per_chunk / sample_rate
    first_chunk: Optional[int] = None
    last_chunk: Optional[int] = None

    current = audio_array[:samples_per_chunk]
    remaining = audio_array[samples_per_chunk:]
    chunk_idx: int = 0

    # Determine main block of speech
    while len(remaining) > 0:
        if detector(current, sample_rate=sample_rate) >= threshold:
            if first_chunk is None:
                # First speech
                first_chunk = chunk_idx
            else:
                # Last speech so far
                last_chunk = chunk_idx

        current = remaining[:samples_per_chunk]
        remaining = remaining[samples_per_chunk:]
        chunk_idx += 1

    if (first_chunk is None) or (last_chunk is None):
        # Not enough speech found to define a span
        return 0.0, None

    # Widen the span by the requested number of context chunks
    start_chunk = max(0, first_chunk - keep_chunks_before)
    end_chunk = min(chunk_idx, last_chunk + keep_chunks_after)

    # Compute offset/duration
    offset_sec = start_chunk * seconds_per_chunk
    duration_sec = (end_chunk + 1) * seconds_per_chunk - offset_sec

    return offset_sec, duration_sec

View File

@@ -0,0 +1,54 @@
import typing
from pathlib import Path
import numpy as np
import onnxruntime
class SileroVoiceActivityDetector:
    """Detects speech/silence using Silero VAD.

    https://github.com/snakers4/silero-vad

    The detector is stateful: the recurrent ``h``/``c`` tensors returned by
    each inference are stored and fed into the next call.
    """

    def __init__(self, onnx_path: typing.Union[str, Path]):
        """Load the ONNX model at ``onnx_path`` with single-threaded execution."""
        onnx_path = str(onnx_path)

        # Thread limits have to be supplied through SessionOptions when the
        # session is created; assigning them as attributes on an existing
        # InferenceSession does not configure the runtime.
        session_options = onnxruntime.SessionOptions()
        session_options.intra_op_num_threads = 1
        session_options.inter_op_num_threads = 1
        self.session = onnxruntime.InferenceSession(
            onnx_path, sess_options=session_options
        )

        # Initial recurrent state passed to the model as "h0"/"c0"
        self._h = np.zeros((2, 1, 64)).astype("float32")
        self._c = np.zeros((2, 1, 64)).astype("float32")

    def __call__(self, audio_array: np.ndarray, sample_rate: int = 16000):
        """Return probability of speech in audio [0-1].

        Audio must be 16Khz mono; the samples are cast to float32 before
        being given to the model.

        Args:
            audio_array: a 1-D chunk, or a 2-D array with a batch dimension of 1.
            sample_rate: must be 16000.

        Returns:
            Array holding the speech probability for the (single) batch item.

        Raises:
            ValueError: for more than 2 dimensions, a batch larger than 1, or
                a sample rate other than 16000.
        """
        if len(audio_array.shape) == 1:
            # Add batch dimension
            audio_array = np.expand_dims(audio_array, 0)

        if len(audio_array.shape) > 2:
            raise ValueError(
                f"Too many dimensions for input audio chunk {audio_array.shape}"
            )

        if audio_array.shape[0] > 1:
            raise ValueError("Onnx model does not support batching")

        if sample_rate != 16000:
            raise ValueError("Only 16Khz audio is supported")

        ort_inputs = {
            "input": audio_array.astype(np.float32),
            "h0": self._h,
            "c0": self._c,
        }
        ort_outs = self.session.run(None, ort_inputs)
        # Keep the updated recurrent state for the next chunk
        out, self._h, self._c = ort_outs

        out = out.squeeze(2)[:, 1]  # make output type match JIT analog

        return out

View File

@@ -0,0 +1,502 @@
#!/usr/bin/env python3
import argparse
import csv
import dataclasses
import itertools
import json
import logging
import os
import unicodedata
from collections import Counter
from dataclasses import dataclass, field
from enum import Enum
from multiprocessing import JoinableQueue, Process, Queue
from pathlib import Path
from typing import Dict, Iterable, List, Optional
from piper_phonemize import (
phonemize_espeak,
phonemize_codepoints,
phoneme_ids_espeak,
phoneme_ids_codepoints,
get_codepoints_map,
get_espeak_map,
get_max_phonemes,
tashkeel_run,
)
from .norm_audio import cache_norm_audio, make_silence_detector
# Directory containing this module; the VERSION file is read relative to it.
_DIR = Path(__file__).parent
# Version string, written into the dataset config as "piper_version".
_VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
# Logger shared by the main process and the worker functions in this module.
_LOGGER = logging.getLogger("preprocess")
class PhonemeType(str, Enum):
    """Where the phonemes of an utterance come from."""

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes come from text itself
    TEXT = "text"
def main() -> None:
    """Preprocess an audio dataset for training.

    Steps:
        1. Parse command-line arguments and create the output/cache directories.
        2. Make a first pass over the dataset to count utterances per speaker.
        3. Write ``config.json`` to the output directory.
        4. Start worker processes, feed them batches of utterances and write
           every successfully processed utterance to ``dataset.jsonl``.
        5. Report phonemes that had no id and stop the workers.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-dir", required=True, help="Directory with audio dataset"
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write output files for training",
    )
    parser.add_argument("--language", required=True, help="eSpeak-ng voice")
    parser.add_argument(
        "--sample-rate",
        type=int,
        required=True,
        help="Target sample rate for voice (hertz)",
    )
    parser.add_argument(
        "--dataset-format", choices=("ljspeech", "mycroft"), required=True
    )
    parser.add_argument("--cache-dir", help="Directory to cache processed audio files")
    parser.add_argument("--max-workers", type=int)
    parser.add_argument(
        "--single-speaker", action="store_true", help="Force single speaker dataset"
    )
    parser.add_argument(
        "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
    )
    #
    parser.add_argument(
        "--phoneme-type",
        choices=list(PhonemeType),
        default=PhonemeType.ESPEAK,
        help="Type of phonemes to use (default: espeak)",
    )
    parser.add_argument(
        "--text-casing",
        choices=("ignore", "lower", "upper", "casefold"),
        default="ignore",
        help="Casing applied to utterance text",
    )
    #
    parser.add_argument(
        "--dataset-name",
        help="Name of dataset to put in config (default: name of <output_dir>/../)",
    )
    parser.add_argument(
        "--audio-quality",
        help="Audio quality to put in config (default: name of <output_dir>)",
    )
    #
    parser.add_argument(
        "--tashkeel",
        action="store_true",
        help="Diacritize Arabic text with libtashkeel",
    )
    #
    parser.add_argument(
        "--skip-audio", action="store_true", help="Don't preprocess audio"
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    if args.single_speaker and (args.speaker_id is not None):
        _LOGGER.fatal("--single-speaker and --speaker-id cannot both be provided")
        return

    level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=level)
    logging.getLogger().setLevel(level)

    # Prevent log spam
    logging.getLogger("numba").setLevel(logging.WARNING)

    # Ensure enum
    args.phoneme_type = PhonemeType(args.phoneme_type)

    # Convert to paths and create output directories
    args.input_dir = Path(args.input_dir)
    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    args.cache_dir = (
        Path(args.cache_dir)
        if args.cache_dir
        else args.output_dir / "cache" / str(args.sample_rate)
    )
    args.cache_dir.mkdir(parents=True, exist_ok=True)

    if args.dataset_format == "mycroft":
        make_dataset = mycroft_dataset
    else:
        make_dataset = ljspeech_dataset

    # Count speakers
    _LOGGER.debug("Counting number of speakers/utterances in the dataset")
    speaker_counts: "Counter[str]" = Counter()
    num_utterances = 0
    for utt in make_dataset(args):
        speaker = utt.speaker or ""
        speaker_counts[speaker] += 1
        num_utterances += 1

    assert num_utterances > 0, "No utterances found"

    is_multispeaker = len(speaker_counts) > 1
    speaker_ids: Dict[str, int] = {}

    if is_multispeaker:
        _LOGGER.info("%s speakers detected", len(speaker_counts))

        # Assign speaker ids by most number of utterances first
        for speaker_id, (speaker, _speaker_count) in enumerate(
            speaker_counts.most_common()
        ):
            speaker_ids[speaker] = speaker_id
    else:
        _LOGGER.info("Single speaker dataset")

    # Write config
    audio_quality = args.audio_quality or args.output_dir.name
    dataset_name = args.dataset_name or args.output_dir.parent.name

    with open(args.output_dir / "config.json", "w", encoding="utf-8") as config_file:
        json.dump(
            {
                "dataset": dataset_name,
                "audio": {
                    "sample_rate": args.sample_rate,
                    "quality": audio_quality,
                },
                "espeak": {
                    "voice": args.language,
                },
                "language": {
                    "code": args.language,
                },
                "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
                "phoneme_type": args.phoneme_type.value,
                "phoneme_map": {},
                "phoneme_id_map": get_codepoints_map()[args.language]
                if args.phoneme_type == PhonemeType.TEXT
                else get_espeak_map(),
                "num_symbols": get_max_phonemes(),
                "num_speakers": len(speaker_counts),
                "speaker_id_map": speaker_ids,
                "piper_version": _VERSION,
            },
            config_file,
            ensure_ascii=False,
            indent=4,
        )
    _LOGGER.info("Wrote dataset config")

    if (args.max_workers is None) or (args.max_workers < 1):
        # os.cpu_count() may return None when the count cannot be determined
        args.max_workers = os.cpu_count() or 1

    assert args.max_workers is not None

    # Never let the batch size drop to zero: with fewer utterances than
    # 2 * max_workers the division truncates to 0 and batched() would raise.
    batch_size = max(1, int(num_utterances / (args.max_workers * 2)))
    queue_in: "Queue[Iterable[Utterance]]" = JoinableQueue()
    queue_out: "Queue[Optional[Utterance]]" = Queue()

    # Start workers
    if args.phoneme_type == PhonemeType.TEXT:
        target = phonemize_batch_text
    else:
        target = phonemize_batch_espeak

    processes = [
        Process(target=target, args=(args, queue_in, queue_out))
        for _ in range(args.max_workers)
    ]
    for proc in processes:
        proc.start()

    _LOGGER.info(
        "Processing %s utterance(s) with %s worker(s)", num_utterances, args.max_workers
    )
    with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
        for utt_batch in batched(
            make_dataset(args),
            batch_size,
        ):
            queue_in.put(utt_batch)

        _LOGGER.debug("Waiting for jobs to finish")
        missing_phonemes: "Counter[str]" = Counter()
        # Exactly one result (an utterance, or None on failure) is expected
        # per utterance counted in the first pass.
        for _ in range(num_utterances):
            utt = queue_out.get()
            if utt is not None:
                # speaker_ids is only filled for multi-speaker datasets; a
                # single-speaker dataset that still carries a speaker column
                # must not be looked up in the (empty) map.
                if is_multispeaker and (utt.speaker is not None):
                    utt.speaker_id = speaker_ids[utt.speaker]

                utt_dict = dataclasses.asdict(utt)
                utt_dict.pop("missing_phonemes")

                # JSONL
                json.dump(
                    utt_dict,
                    dataset_file,
                    ensure_ascii=False,
                    cls=PathEncoder,
                )
                print("", file=dataset_file)

                missing_phonemes.update(utt.missing_phonemes)

        if missing_phonemes:
            for phoneme, count in missing_phonemes.most_common():
                _LOGGER.warning("Missing %s (%s)", phoneme, count)

            _LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))

    # Signal workers to stop
    for proc in processes:
        queue_in.put(None)

    # Wait for workers to stop
    for proc in processes:
        proc.join(timeout=1)
# -----------------------------------------------------------------------------
def get_text_casing(casing: str):
    """Return the string transform for a ``--text-casing`` value.

    "lower", "upper" and "casefold" map to the matching ``str`` method; any
    other value yields a function that returns its argument unchanged.
    """
    transforms = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }
    return transforms.get(casing, lambda s: s)
def phonemize_batch_espeak(
    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
    """Worker loop: phonemize utterances with espeak-ng.

    Batches of utterances are read from ``queue_in`` until a ``None`` sentinel
    arrives. For every utterance exactly one item is put on ``queue_out``: the
    processed utterance on success, or ``None`` when it had to be skipped. The
    consumer counts results, so a skipped utterance must still be reported.
    """
    try:
        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()

        while True:
            utt_batch = queue_in.get()
            if utt_batch is None:
                break

            for utt in utt_batch:
                try:
                    if args.tashkeel:
                        utt.text = tashkeel_run(utt.text)

                    _LOGGER.debug(utt)
                    all_phonemes = phonemize_espeak(casing(utt.text), args.language)

                    # Flatten
                    utt.phonemes = [
                        phoneme
                        for sentence_phonemes in all_phonemes
                        for phoneme in sentence_phonemes
                    ]
                    utt.phoneme_ids = phoneme_ids_espeak(
                        utt.phonemes,
                        missing_phonemes=utt.missing_phonemes,
                    )
                    if not args.skip_audio:
                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
                            utt.audio_path,
                            args.cache_dir,
                            silence_detector,
                            args.sample_rate,
                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
                    # Report the skip; otherwise the consumer waits forever
                    # for this utterance's result.
                    queue_out.put(None)
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", utt)
                    queue_out.put(None)

            queue_in.task_done()
    except Exception:
        _LOGGER.exception("phonemize_batch_espeak")
def phonemize_batch_text(
    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
    """Worker loop: derive phonemes from the text's own codepoints.

    Batches of utterances are read from ``queue_in`` until a ``None`` sentinel
    arrives. For every utterance exactly one item is put on ``queue_out``: the
    processed utterance on success, or ``None`` when it had to be skipped. The
    consumer counts results, so a skipped utterance must still be reported.
    """
    try:
        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()

        while True:
            utt_batch = queue_in.get()
            if utt_batch is None:
                break

            for utt in utt_batch:
                try:
                    if args.tashkeel:
                        utt.text = tashkeel_run(utt.text)

                    _LOGGER.debug(utt)
                    all_phonemes = phonemize_codepoints(casing(utt.text))

                    # Flatten
                    utt.phonemes = [
                        phoneme
                        for sentence_phonemes in all_phonemes
                        for phoneme in sentence_phonemes
                    ]
                    utt.phoneme_ids = phoneme_ids_codepoints(
                        args.language,
                        utt.phonemes,
                        missing_phonemes=utt.missing_phonemes,
                    )
                    if not args.skip_audio:
                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
                            utt.audio_path,
                            args.cache_dir,
                            silence_detector,
                            args.sample_rate,
                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
                    # Report the skip; otherwise the consumer waits forever
                    # for this utterance's result.
                    queue_out.put(None)
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", utt)
                    queue_out.put(None)

            queue_in.task_done()
    except Exception:
        _LOGGER.exception("phonemize_batch_text")
# -----------------------------------------------------------------------------
@dataclasses.dataclass
class Utterance:
    """One dataset entry: its text, its audio file and everything derived from them.

    Only ``text`` and ``audio_path`` are required; the remaining fields start
    out empty and are filled in during preprocessing.
    """

    # Required inputs
    text: str
    audio_path: Path

    # Speaker name and numeric id, when known
    speaker: Optional[str] = None
    speaker_id: Optional[int] = None

    # Phonemization results
    phonemes: Optional[List[str]] = None
    phoneme_ids: Optional[List[int]] = None

    # Paths returned by cache_norm_audio
    audio_norm_path: Optional[Path] = None
    audio_spec_path: Optional[Path] = None

    # Per-utterance count of phonemes reported as missing (fresh Counter each)
    missing_phonemes: "Counter[str]" = dataclasses.field(default_factory=Counter)
class PathEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``Path`` objects as strings."""

    def default(self, o):
        """Return ``str(o)`` for paths; defer to the base encoder otherwise."""
        return str(o) if isinstance(o, Path) else super().default(o)
def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
    """Yield utterances from an LJSpeech-style dataset directory.

    ``<input_dir>/metadata.csv`` is read as ``|``-delimited rows of either
    ``filename|text`` or ``filename|speaker|text``. The speaker column is
    ignored when ``args.single_speaker`` is set. Unless ``args.skip_audio`` is
    set, rows whose audio file is missing or empty are skipped with a warning.
    """
    dataset_dir = args.input_dir
    is_single_speaker = args.single_speaker
    speaker_id = args.speaker_id
    skip_audio = args.skip_audio

    # filename|speaker|text
    # speaker is optional
    metadata_path = dataset_dir / "metadata.csv"
    assert metadata_path.exists(), f"Missing {metadata_path}"

    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    with open(metadata_path, "r", encoding="utf-8") as csv_file:
        for row in csv.reader(csv_file, delimiter="|"):
            assert len(row) >= 2, "Not enough columns"

            filename, text = row[0], row[-1]
            speaker: Optional[str] = None
            if (not is_single_speaker) and (len(row) != 2):
                speaker = row[1]

            # Lookup order: next to the metadata file (as-is, then with .wav),
            # then inside wav/ or wavs/ (as-is, then with .wav). The last
            # candidate is used when none of the earlier ones exists.
            candidates = (
                metadata_path.parent / filename,
                metadata_path.parent / f"{filename}.wav",
                wav_dir / filename,
                wav_dir / f"{filename}.wav",
            )
            wav_path = next(
                (path for path in candidates[:-1] if path.exists()),
                candidates[-1],
            )

            if not skip_audio:
                if not wav_path.exists():
                    _LOGGER.warning("Missing %s", filename)
                    continue

                if wav_path.stat().st_size == 0:
                    _LOGGER.warning("Empty file: %s", wav_path)
                    continue

            yield Utterance(
                text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
            )
def mycroft_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
    """Yield utterances from a Mycroft-style dataset directory.

    Every ``*-metadata.txt`` file found recursively under ``args.input_dir``
    is read as ``|``-delimited rows (``filename|text|length``). Unless
    ``args.single_speaker`` is set, the speaker is the name of the directory
    holding the metadata file and the speaker id is the running index of that
    metadata file. Rows whose audio is missing or empty are dropped unless
    ``args.skip_audio`` is set.
    """
    dataset_dir = args.input_dir
    is_single_speaker = args.single_speaker
    skip_audio = args.skip_audio

    metadata_files = dataset_dir.glob("**/*-metadata.txt")
    for speaker_id, metadata_path in enumerate(metadata_files):
        speaker = None if is_single_speaker else metadata_path.parent.name
        utt_speaker_id = None if is_single_speaker else speaker_id

        with open(metadata_path, "r", encoding="utf-8") as csv_file:
            # filename|text|length
            for row in csv.reader(csv_file, delimiter="|"):
                filename, text = row[0], row[1]
                wav_path = metadata_path.parent / filename

                if not skip_audio:
                    # Require an existing, non-empty audio file
                    if not (wav_path.exists() and (wav_path.stat().st_size > 0)):
                        continue

                yield Utterance(
                    text=text,
                    audio_path=wav_path,
                    speaker=speaker,
                    speaker_id=utt_speaker_id,
                )
# -----------------------------------------------------------------------------
def batched(iterable, n):
    "Batch data into lists of length n. The last batch may be shorter."
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        raise ValueError("n must be at least one")

    source = iter(iterable)
    # Keep taking slices of n items until a slice comes back empty
    yield from iter(lambda: list(itertools.islice(source, n)), [])
# -----------------------------------------------------------------------------
# Run the preprocessing command-line interface when executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,40 @@
[MESSAGES CONTROL]
disable=
format,
abstract-class-little-used,
abstract-method,
cyclic-import,
duplicate-code,
global-statement,
import-outside-toplevel,
inconsistent-return-statements,
locally-disabled,
not-context-manager,
redefined-variable-type,
too-few-public-methods,
too-many-arguments,
too-many-branches,
too-many-instance-attributes,
too-many-lines,
too-many-locals,
too-many-public-methods,
too-many-return-statements,
too-many-statements,
too-many-boolean-expressions,
unnecessary-pass,
unused-argument,
broad-except,
too-many-nested-blocks,
invalid-name,
unused-import,
no-self-use,
fixme,
useless-super-delegation,
missing-module-docstring,
missing-class-docstring,
missing-function-docstring,
import-error,
relative-beyond-top-level
[FORMAT]
expected-line-ending-format=LF

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import argparse
import csv
import sys
from collections import Counter, defaultdict
def main():
    """Filter a ``audio|speaker|text`` CSV from stdin down to one speaker.

    The speaker is selected either by name (``--speaker-name``) or by rank
    (``--speaker-number``, where 0 is the speaker with the most utterances).
    Matching rows are written to stdout as ``audio|text``. When selecting by
    rank, the chosen speaker's name is printed to stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--speaker-number", type=int)
    parser.add_argument("--speaker-name")
    args = parser.parse_args()

    # Validate explicitly: an assert would be stripped under ``python -O``
    if (args.speaker_number is None) and (args.speaker_name is None):
        parser.error("one of --speaker-number or --speaker-name is required")

    reader = csv.reader(sys.stdin, delimiter="|")
    writer = csv.writer(sys.stdout, delimiter="|")

    if args.speaker_name is not None:
        for row in reader:
            if not row:
                # Blank input line
                continue

            audio, speaker_id, text = row[0], row[1], row[-1]
            if args.speaker_name == speaker_id:
                writer.writerow((audio, text))

        return

    # Group utterances by speaker, counting as we go
    utterances = defaultdict(list)
    counts = Counter()
    for row in reader:
        if not row:
            # Blank input line
            continue

        audio, speaker_id, text = row[0], row[1], row[-1]
        utterances[speaker_id].append((audio, text))
        counts[speaker_id] += 1

    # Speakers ordered by number of utterances, most first
    ranked = [speaker_id for speaker_id, _count in counts.most_common()]
    if 0 <= args.speaker_number < len(ranked):
        speaker_id = ranked[args.speaker_number]
        for row in utterances[speaker_id]:
            writer.writerow(row)

        print(speaker_id, file=sys.stderr)
# Run the speaker filter when executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,22 @@
[flake8]
# To work with Black
max-line-length = 88
# E501: line too long
# W503: Line break occurred before a binary operator
# E203: Whitespace before ':'
# D202 No blank lines allowed after function docstring
# W504 line break after binary operator
ignore =
E501,
W503,
E203,
D202,
W504
[isort]
multi_line_output = 3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
indent = " "

View File

@@ -0,0 +1,427 @@
import math
import typing
import torch
from torch import nn
from torch.nn import functional as F
from .commons import subsequent_mask
from .modules import LayerNorm
class Encoder(nn.Module):
    """Stack of ``n_layers`` blocks of windowed self-attention plus feed-forward.

    Each block applies attention, dropout and a residual LayerNorm, followed
    by an FFN, dropout and a second residual LayerNorm.
    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int = 1,
        p_dropout: float = 0.0,
        window_size: int = 4,
        **kwargs
    ):
        super().__init__()

        # Keep the hyper-parameters around for inspection
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        # Sub-modules are built layer by layer, in attention -> norm -> ffn ->
        # norm order.
        for _ in range(n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run all blocks over ``x``, multiplying by ``x_mask`` before and after."""
        # Pairwise mask: product of the mask with itself along a new axis
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask

        for attention, norm_attn, feed_forward, norm_ffn in zip(
            self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
        ):
            # Self-attention with residual connection
            x = norm_attn(x + self.drop(attention(x, x, attn_mask)))

            # Feed-forward with residual connection
            x = norm_ffn(x + self.drop(feed_forward(x, x_mask)))

        return x * x_mask
class Decoder(nn.Module):
    """Stack of ``n_layers`` blocks: self-attention, encoder-decoder attention, causal FFN.

    Every sub-layer is followed by dropout and a residual LayerNorm.
    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int = 1,
        p_dropout: float = 0.0,
        proximal_bias: bool = False,
        proximal_init: bool = True,
        **kwargs
    ):
        super().__init__()

        # Keep the hyper-parameters around for inspection
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        # Sub-modules are built layer by layer, in the order they are applied
        for _ in range(n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            cross_attention = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        # Lower-triangular mask for the self-attention
        self_attn_mask = subsequent_mask(x_mask.size(2)).type_as(x)
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask

        for self_attention, norm_0, cross_attention, norm_1, feed_forward, norm_2 in zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        ):
            # Self-attention over the decoder input
            x = norm_0(x + self.drop(self_attention(x, x, self_attn_mask)))

            # Attention over the encoder output
            x = norm_1(x + self.drop(cross_attention(x, h, encdec_attn_mask)))

            # Feed-forward
            x = norm_2(x + self.drop(feed_forward(x, x_mask)))

        return x * x_mask
class MultiHeadAttention(nn.Module):
    """Multi-head attention over ``[b, d, t]`` tensors using 1x1 convolutions.

    Optional features, each restricted to self-attention by an assertion:
    relative position embeddings (``window_size``), a proximal bias
    (``proximal_bias``) and a banded local mask (``block_length``).
    """

    def __init__(
        self,
        channels: int,
        out_channels: int,
        n_heads: int,
        p_dropout: float = 0.0,
        window_size: typing.Optional[int] = None,
        heads_share: bool = True,
        block_length: typing.Optional[int] = None,
        proximal_bias: bool = False,
        proximal_init: bool = False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Overwritten with the attention weights on every forward pass
        self.attn = torch.zeros(1)

        # Channels per head
        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding table shared by all heads, or one table per head
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            rel_size = window_size * 2 + 1
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, rel_size, self.k_channels) * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, rel_size, self.k_channels) * rel_stddev
            )

        for projection in (self.conv_q, self.conv_k, self.conv_v):
            nn.init.xavier_uniform_(projection.weight)

        if proximal_init:
            # Start with the key projection equal to the query projection
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values); stores weights in ``self.attn``."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        attended, self.attn = self.attention(q, k, v, mask=attn_mask)

        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Return ``(output [b, d, t_t], attention weights [b, n_h, t_t, t_s])``."""
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (key.size(0), key.size(1), key.size(2), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # The scaled query is shared by the content and relative-position scores
        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))

        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                scaled_query, key_relative_embeddings
            )
            scores = scores + self._relative_position_to_absolute_position(rel_logits)

        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).type_as(scores)

        if mask is not None:
            # Masked positions get a large negative score before the softmax
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                # Band of width block_length around the diagonal
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)

        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)

        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )

        # [b, n_h, t_t, d_k] -> [b, d, t_t]
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length: int):
        """Pad and/or slice the embedding table to ``2 * length - 1`` positions."""
        # max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1

        if pad_length > 0:
            # Pads the second dimension on both sides
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                (0, 0, pad_length, pad_length, 0, 0),
            )
        else:
            padded_relative_embeddings = relative_embeddings

        return padded_relative_embeddings[:, slice_start_position:slice_end_position]

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()

        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0))

        # Reshape and slice out the padded elements.
        reshaped = x_flat.view([batch, heads, length + 1, (2 * length) - 1])
        return reshaped[:, :, :length, length - 1 :]

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()

        # pad along column
        x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0))
        x_flat = x.view([batch, heads, (length * length) + (length * (length - 1))])

        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0))
        reshaped = x_flat.view([batch, heads, length, 2 * length])
        return reshaped[:, :, :, 1:]

    def _attention_bias_proximal(self, length: int):
        """Bias for self-attention to encourage attention to close positions.
        Args:
            length: an integer scalar.
        Returns:
            a Tensor with shape [1, 1, length, length]
        """
        positions = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(positions, 0) - torch.unsqueeze(positions, 1)
        bias = -torch.log1p(torch.abs(diff))
        return torch.unsqueeze(torch.unsqueeze(bias, 0), 0)
class FFN(nn.Module):
    """Two-layer convolutional feed-forward block with masking.

    Inputs are padded explicitly before each convolution: left-only when
    ``causal`` is set, otherwise split across both sides.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float = 0.0,
        activation: str = "",
        causal: bool = False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking before each conv and at the end."""
        hidden = self.conv_1(self._pad(x * x_mask))

        if self.activation == "gelu":
            # Sigmoid-based GELU approximation
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)

        hidden = self.drop(hidden)
        out = self.conv_2(self._pad(hidden * x_mask))
        return out * x_mask

    def _pad(self, x):
        """Pad ``x`` according to the ``causal`` setting."""
        if self.causal:
            return self._causal_padding(x)

        return self._same_padding(x)

    def _causal_padding(self, x):
        """Pad ``kernel_size - 1`` zeros on the left of the last dimension."""
        if self.kernel_size == 1:
            return x

        pad_l = self.kernel_size - 1
        pad_r = 0
        return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0))

    def _same_padding(self, x):
        """Pad both sides of the last dimension so the convolution keeps its length."""
        if self.kernel_size == 1:
            return x

        pad_l = (self.kernel_size - 1) // 2
        pad_r = self.kernel_size // 2
        return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0))

View File

@@ -0,0 +1,147 @@
import logging
import math
from typing import Optional
import torch
from torch.nn import functional as F
# Module-level logger registered under the name "vits.commons".
_LOGGER = logging.getLogger("vits.commons")
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when ``m``'s class name contains "Conv".

    Modules of any other class are left untouched.
    """
    if "Conv" in type(m).__name__:
        m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    span = kernel_size * dilation - dilation
    return int(span / 2)
def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements of ``lst``."""
    # Validates that lst is sized, as the slice-assignment form does
    len(lst)

    result = [item]
    for element in lst:
        result.append(element)
        result.append(item)

    return result
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)"""
    # Term built from the ratio of exp(2 * logs) and the squared mean difference
    spread = torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)
    scaled_spread = 0.5 * spread * torch.exp(-2.0 * logs_q)

    kl = (logs_q - logs_p) - 0.5
    kl += scaled_spread
    return kl
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Scale and shift the uniform draw so that neither logarithm sees 0 or 1
    uniform = torch.rand(shape) * 0.99998 + 0.00001
    inner_log = torch.log(uniform)
    return -torch.log(-inner_log)
def rand_gumbel_like(x):
    """Return Gumbel noise with the size, dtype and device of ``x``."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
def slice_segments(x, ids_str, segment_size=4):
    """Cut one window of ``segment_size`` along the last axis for each batch item.

    ``ids_str[i]`` is the start index for item ``i``; negative starts are
    clamped to 0.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for batch_idx in range(x.size(0)):
        start = max(0, ids_str[batch_idx])
        segments[batch_idx] = x[batch_idx, :, start : start + segment_size]

    return segments
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice a randomly positioned window of ``segment_size`` from each batch item.

    Start indices are drawn uniformly below ``x_lengths - segment_size + 1``
    (the full last-axis length is used when ``x_lengths`` is None).

    Returns:
        ``(segments, start_indices)``
    """
    batch_size, _, num_frames = x.size()
    lengths = num_frames if x_lengths is None else x_lengths

    max_start = lengths - segment_size + 1
    uniform = torch.rand([batch_size]).to(device=x.device)
    ids_str = (uniform * max_start).to(dtype=torch.long)

    return slice_segments(x, ids_str, segment_size), ids_str
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines of the position scaled by geometrically spaced inverse
    timescales from ``min_timescale`` to ``max_timescale``. For an odd number
    of channels a final row of zeros is appended.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2

    # With a single timescale (channels of 2 or 3) there is no spacing to
    # compute; guard the denominator so it does not divide by zero. The
    # increment is multiplied by index 0 in that case, so its value is unused.
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
        max(num_timescales - 1, 1)
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )

    # [num_timescales, length]
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)

    # Append one zero row when channels is odd
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Return ``x`` plus the timing signal matching its channel count and length."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Return ``x`` concatenated along ``axis`` with its matching timing signal."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
def subsequent_mask(length: int):
    """Return a ``[1, 1, length, length]`` lower-triangular mask of ones."""
    lower_triangle = torch.tril(torch.ones(length, length))
    return lower_triangle.unsqueeze(0).unsqueeze(0)
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Add the inputs, then gate: tanh of the first channels times sigmoid of the rest.

    ``n_channels[0]`` is the index on the channel axis where the split happens.
    """
    n_channels_int = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :n_channels_int, :])
    sigmoid_part = torch.sigmoid(summed[:, n_channels_int:, :])
    return tanh_part * sigmoid_part
def sequence_mask(length, max_length: Optional[int] = None):
    """Return a boolean mask where row ``i`` is True for positions below ``length[i]``.

    The number of columns is ``max_length``, or ``length.max()`` when omitted.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions.unsqueeze(0) < length.unsqueeze(1)
def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    b, _, t_y, t_x = mask.shape

    # Cumulative durations, flattened to one entry per (batch, x position)
    cum_duration_flat = torch.cumsum(duration, -1).view(b * t_x)

    # Row k is 1 for every y position below the k-th cumulative duration
    filled = sequence_mask(cum_duration_flat, t_y).type_as(mask)
    filled = filled.view(b, t_x, t_y)

    # Subtract each row's predecessor so only its own span remains
    shifted = F.pad(filled, (0, 0, 1, 0, 0, 0))[:, :-1]
    path = filled - shifted

    return path.unsqueeze(1).transpose(2, 3) * mask
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise in place and report their total norm.

    Args:
        parameters: A single tensor or an iterable of tensors; entries whose
            ``grad`` is ``None`` are ignored.
        clip_value: Gradients are clamped to ``[-clip_value, clip_value]``;
            ``None`` disables clamping.
        norm_type: Order of the norm used for the returned total.

    Returns:
        The ``norm_type``-norm over all gradients, measured *before* clamping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    exponent = float(norm_type)
    limit = None if clip_value is None else float(clip_value)

    accumulated = 0
    for param in with_grad:
        grad = param.grad.data
        # The norm is taken first so the result reflects the unclipped gradient.
        accumulated += grad.norm(exponent).item() ** exponent
        if limit is not None:
            grad.clamp_(min=-limit, max=limit)
    return accumulated ** (1.0 / exponent)

View File

@@ -0,0 +1,330 @@
"""Configuration classes"""
from dataclasses import dataclass, field
from typing import Optional, Tuple
@dataclass
class MelAudioConfig:
    """Default audio / mel-spectrogram settings.

    NOTE(review): the per-field notes are inferred from the field names; the
    code that consumes this class is not visible here - confirm before relying
    on them.
    """

    # Presumably STFT parameters, in samples.
    filter_length: int = 1024
    hop_length: int = 256
    win_length: int = 1024

    # Presumably the number of mel bands.
    mel_channels: int = 80

    # Presumably the PCM format: rate in Hz, bytes per sample, channel count.
    sample_rate: int = 22050
    sample_bytes: int = 2
    channels: int = 1

    # Presumably the mel filterbank frequency bounds; fmax may be left unset.
    mel_fmin: float = 0.0
    mel_fmax: Optional[float] = None
@dataclass
class ModelAudioConfig:
    """Decoder (resblock / upsampling) settings with two ready-made presets.

    All fields are required; ``low_quality()`` and ``high_quality()`` return
    fully populated instances.
    """

    resblock: str
    resblock_kernel_sizes: Tuple[int, ...]
    resblock_dilation_sizes: Tuple[Tuple[int, ...], ...]
    upsample_rates: Tuple[int, ...]
    upsample_initial_channel: int
    upsample_kernel_sizes: Tuple[int, ...]

    @staticmethod
    def low_quality() -> "ModelAudioConfig":
        """Return the smaller preset (resblock "2", 256 initial channels)."""
        dilations = ((1, 2), (2, 6), (3, 12))
        return ModelAudioConfig(
            resblock="2",
            resblock_kernel_sizes=(3, 5, 7),
            resblock_dilation_sizes=dilations,
            upsample_rates=(8, 8, 4),
            upsample_initial_channel=256,
            upsample_kernel_sizes=(16, 16, 8),
        )

    @staticmethod
    def high_quality() -> "ModelAudioConfig":
        """Return the larger preset (resblock "1", 512 initial channels)."""
        # The same dilation triple is used for each of the three kernel sizes.
        dilations = ((1, 3, 5),) * 3
        return ModelAudioConfig(
            resblock="1",
            resblock_kernel_sizes=(3, 7, 11),
            resblock_dilation_sizes=dilations,
            upsample_rates=(8, 8, 2, 2),
            upsample_initial_channel=512,
            upsample_kernel_sizes=(16, 16, 4, 4),
        )
@dataclass
class ModelConfig:
    """Top-level model settings.

    Wraps a ``ModelAudioConfig`` (required) and a ``MelAudioConfig``
    (defaulted) and re-exposes the audio settings as read-only properties.
    After construction, ``gin_channels`` is set to 512 when the model is
    multi-speaker and no value was given.
    """

    num_symbols: int
    n_speakers: int
    audio: ModelAudioConfig
    mel: MelAudioConfig = field(default_factory=MelAudioConfig)

    inter_channels: int = 192
    hidden_channels: int = 192
    filter_channels: int = 768
    n_heads: int = 2
    n_layers: int = 6
    kernel_size: int = 3
    p_dropout: float = 0.1
    n_layers_q: int = 3
    use_spectral_norm: bool = False
    gin_channels: int = 0  # single speaker
    use_sdp: bool = True  # StochasticDurationPredictor
    segment_size: int = 8192

    def __post_init__(self):
        """Fill in the default ``gin_channels`` for multi-speaker models."""
        if not self.is_multispeaker:
            return
        if self.gin_channels == 0:
            self.gin_channels = 512

    @property
    def is_multispeaker(self) -> bool:
        """True when more than one speaker is configured."""
        return self.n_speakers > 1

    # The properties below are plain pass-throughs to ``self.audio``.

    @property
    def resblock(self) -> str:
        return self.audio.resblock

    @property
    def resblock_kernel_sizes(self) -> Tuple[int, ...]:
        return self.audio.resblock_kernel_sizes

    @property
    def resblock_dilation_sizes(self) -> Tuple[Tuple[int, ...], ...]:
        return self.audio.resblock_dilation_sizes

    @property
    def upsample_rates(self) -> Tuple[int, ...]:
        return self.audio.upsample_rates

    @property
    def upsample_initial_channel(self) -> int:
        return self.audio.upsample_initial_channel

    @property
    def upsample_kernel_sizes(self) -> Tuple[int, ...]:
        return self.audio.upsample_kernel_sizes
@dataclass
class TrainingConfig:
    """Default optimizer, learning-rate schedule and loss-weight settings."""

    # Optimizer settings.
    learning_rate: float = 2e-4
    betas: Tuple[float, float] = (0.8, 0.99)
    eps: float = 1e-9
    # batch_size: int = 32
    fp16_run: bool = False

    # Learning-rate schedule.
    lr_decay: float = 0.999875
    init_lr_ratio: float = 1.0
    warmup_epochs: int = 0

    # Loss weights (presumably for the mel and KL terms - confirm in the
    # training code) and optional gradient clipping.
    c_mel: int = 45
    c_kl: float = 1.0
    grad_clip: Optional[float] = None
# @dataclass
# class PhonemesConfig(DataClassJsonMixin):
# phoneme_separator: str = " "
# """Separator between individual phonemes in CSV input"""
# word_separator: str = "#"
# """Separator between word phonemes in CSV input (must not match phoneme_separator)"""
# phoneme_to_id: typing.Optional[typing.Dict[str, int]] = None
# pad: typing.Optional[str] = "_"
# bos: typing.Optional[str] = None
# eos: typing.Optional[str] = None
# blank: typing.Optional[str] = "#"
# blank_word: typing.Optional[str] = None
# blank_between: typing.Union[str, BlankBetween] = BlankBetween.WORDS
# blank_at_start: bool = True
# blank_at_end: bool = True
# simple_punctuation: bool = True
# punctuation_map: typing.Optional[typing.Dict[str, str]] = None
# separate: typing.Optional[typing.List[str]] = None
# separate_graphemes: bool = False
# separate_tones: bool = False
# tone_before: bool = False
# phoneme_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None
# auto_bos_eos: bool = False
# minor_break: typing.Optional[str] = IPA.BREAK_MINOR.value
# major_break: typing.Optional[str] = IPA.BREAK_MAJOR.value
# break_phonemes_into_graphemes: bool = False
# break_phonemes_into_codepoints: bool = False
# drop_stress: bool = False
# symbols: typing.Optional[typing.List[str]] = None
# def split_word_phonemes(self, phonemes_str: str) -> typing.List[typing.List[str]]:
# """Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)"""
# return [
# word_phonemes_str.split(self.phoneme_separator)
# if self.phoneme_separator
# else list(word_phonemes_str)
# for word_phonemes_str in phonemes_str.split(self.word_separator)
# ]
# def join_word_phonemes(self, word_phonemes: typing.List[typing.List[str]]) -> str:
# """Join a list of lists (outer is words, inner is individual phonemes in each word) into a phonemes string"""
# return self.word_separator.join(
# self.phoneme_separator.join(wp) for wp in word_phonemes
# )
# class Phonemizer(str, Enum):
# SYMBOLS = "symbols"
# GRUUT = "gruut"
# ESPEAK = "espeak"
# EPITRAN = "epitran"
# class Aligner(str, Enum):
# KALDI_ALIGN = "kaldi_align"
# class TextCasing(str, Enum):
# LOWER = "lower"
# UPPER = "upper"
# class MetadataFormat(str, Enum):
# TEXT = "text"
# PHONEMES = "phonemes"
# PHONEME_IDS = "ids"
# @dataclass
# class DatasetConfig:
# name: str
# metadata_format: MetadataFormat = MetadataFormat.TEXT
# multispeaker: bool = False
# text_language: typing.Optional[str] = None
# audio_dir: typing.Optional[typing.Union[str, Path]] = None
# cache_dir: typing.Optional[typing.Union[str, Path]] = None
# def get_cache_dir(self, output_dir: typing.Union[str, Path]) -> Path:
# if self.cache_dir is not None:
# cache_dir = Path(self.cache_dir)
# else:
# cache_dir = Path("cache") / self.name
# if not cache_dir.is_absolute():
# cache_dir = Path(output_dir) / str(cache_dir)
# return cache_dir
# @dataclass
# class AlignerConfig:
# aligner: typing.Optional[Aligner] = None
# casing: typing.Optional[TextCasing] = None
# @dataclass
# class InferenceConfig:
# length_scale: float = 1.0
# noise_scale: float = 0.667
# noise_w: float = 0.8
# @dataclass
# class TrainingConfig(DataClassJsonMixin):
# seed: int = 1234
# epochs: int = 10000
# learning_rate: float = 2e-4
# betas: typing.Tuple[float, float] = field(default=(0.8, 0.99))
# eps: float = 1e-9
# batch_size: int = 32
# fp16_run: bool = False
# lr_decay: float = 0.999875
# segment_size: int = 8192
# init_lr_ratio: float = 1.0
# warmup_epochs: int = 0
# c_mel: int = 45
# c_kl: float = 1.0
# grad_clip: typing.Optional[float] = None
# min_seq_length: typing.Optional[int] = None
# max_seq_length: typing.Optional[int] = None
# min_spec_length: typing.Optional[int] = None
# max_spec_length: typing.Optional[int] = None
# min_speaker_utterances: typing.Optional[int] = None
# last_epoch: int = 1
# global_step: int = 1
# best_loss: typing.Optional[float] = None
# audio: AudioConfig = field(default_factory=AudioConfig)
# model: ModelConfig = field(default_factory=ModelConfig)
# phonemes: PhonemesConfig = field(default_factory=PhonemesConfig)
# text_aligner: AlignerConfig = field(default_factory=AlignerConfig)
# text_language: typing.Optional[str] = None
# phonemizer: typing.Optional[Phonemizer] = None
# datasets: typing.List[DatasetConfig] = field(default_factory=list)
# inference: InferenceConfig = field(default_factory=InferenceConfig)
# version: int = 1
# git_commit: str = ""
# @property
# def is_multispeaker(self):
# return self.model.is_multispeaker or any(d.multispeaker for d in self.datasets)
# def save(self, config_file: typing.TextIO):
# """Save config as JSON to a file"""
# json.dump(self.to_dict(), config_file, indent=4)
# def get_speaker_id(self, dataset_name: str, speaker_name: str) -> int:
# if self.speaker_id_map is None:
# self.speaker_id_map = {}
# full_speaker_name = f"{dataset_name}_{speaker_name}"
# speaker_id = self.speaker_id_map.get(full_speaker_name)
# if speaker_id is None:
# speaker_id = len(self.speaker_id_map)
# self.speaker_id_map[full_speaker_name] = speaker_id
# return speaker_id
# @staticmethod
# def load(config_file: typing.TextIO) -> "TrainingConfig":
# """Load config from a JSON file"""
# return TrainingConfig.from_json(config_file.read())
# @staticmethod
# def load_and_merge(
# config: "TrainingConfig",
# config_files: typing.Iterable[typing.Union[str, Path, typing.TextIO]],
# ) -> "TrainingConfig":
# """Loads one or more JSON configuration files and overlays them on top of an existing config"""
# base_dict = config.to_dict()
# for maybe_config_file in config_files:
# if isinstance(maybe_config_file, (str, Path)):
# # File path
# config_file = open(maybe_config_file, "r", encoding="utf-8")
# else:
# # File object
# config_file = maybe_config_file
# with config_file:
# # Load new config and overlay on existing config
# new_dict = json.load(config_file)
# TrainingConfig.recursive_update(base_dict, new_dict)
# return TrainingConfig.from_dict(base_dict)
# @staticmethod
# def recursive_update(
# base_dict: typing.Dict[typing.Any, typing.Any],
# new_dict: typing.Mapping[typing.Any, typing.Any],
# ) -> None:
# """Recursively overwrites values in base dictionary with values from new dictionary"""
# for key, value in new_dict.items():
# if isinstance(value, collections.Mapping) and (
# base_dict.get(key) is not None
# ):
# TrainingConfig.recursive_update(base_dict[key], value)
# else:
# base_dict[key] = value

View File

@@ -0,0 +1,214 @@
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Sequence, Union
import torch
from torch import FloatTensor, LongTensor
from torch.utils.data import Dataset
_LOGGER = logging.getLogger("vits.dataset")
@dataclass
class Utterance:
    """One dataset entry as read from a dataset file, before tensors are loaded."""

    # Phoneme ids of the utterance.
    phoneme_ids: List[int]
    # Files that PiperDataset.__getitem__ reads with torch.load.
    audio_norm_path: Path
    audio_spec_path: Path
    # Optional metadata; absent keys in the dataset line become None.
    speaker_id: Optional[int] = None
    text: Optional[str] = None
@dataclass
class UtteranceTensors:
    """One utterance with its phoneme ids, spectrogram and audio as tensors."""

    phoneme_ids: LongTensor
    spectrogram: FloatTensor
    audio_norm: FloatTensor
    speaker_id: Optional[LongTensor] = None
    text: Optional[str] = None

    @property
    def spec_length(self) -> int:
        """Size of the spectrogram along dimension 1."""
        return self.spectrogram.shape[1]
@dataclass
class Batch:
    """Padded tensors for a group of utterances, as built by UtteranceCollate."""

    # Padded phoneme ids and the unpadded length of each row.
    phoneme_ids: LongTensor
    phoneme_lengths: LongTensor
    # Padded spectrograms and the unpadded length of each.
    spectrograms: FloatTensor
    spectrogram_lengths: LongTensor
    # Padded audio and the unpadded length of each.
    audios: FloatTensor
    audio_lengths: LongTensor
    # Only set for multi-speaker batches.
    speaker_ids: Optional[LongTensor] = None
class PiperDataset(Dataset):
    """
    Dataset backed by one or more files with one JSON object per line.

    Dataset format:

    * phoneme_ids (required)
    * audio_norm_path (required)
    * audio_spec_path (required)
    * text (optional)
    * phonemes (optional)
    * audio_path (optional)
    """

    def __init__(
        self,
        dataset_paths: List[Union[str, Path]],
        max_phoneme_ids: Optional[int] = None,
    ):
        """Read every dataset file eagerly and keep the parsed utterances.

        Args:
            dataset_paths: Files to load, in order.
            max_phoneme_ids: When set, utterances with more phoneme ids than
                this are left out.
        """
        self.utterances: List[Utterance] = []

        for raw_path in dataset_paths:
            dataset_path = Path(raw_path)
            _LOGGER.debug("Loading dataset: %s", dataset_path)
            loaded = PiperDataset.load_dataset(
                dataset_path, max_phoneme_ids=max_phoneme_ids
            )
            self.utterances.extend(loaded)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx) -> UtteranceTensors:
        """Load the tensors of utterance ``idx`` from disk."""
        utt = self.utterances[idx]

        phoneme_ids = LongTensor(utt.phoneme_ids)
        audio_norm = torch.load(utt.audio_norm_path)
        spectrogram = torch.load(utt.audio_spec_path)

        speaker_id = None
        if utt.speaker_id is not None:
            speaker_id = LongTensor([utt.speaker_id])

        return UtteranceTensors(
            phoneme_ids=phoneme_ids,
            audio_norm=audio_norm,
            spectrogram=spectrogram,
            speaker_id=speaker_id,
            text=utt.text,
        )

    @staticmethod
    def load_dataset(
        dataset_path: Path,
        max_phoneme_ids: Optional[int] = None,
    ) -> Iterable[Utterance]:
        """Yield the utterances of one dataset file.

        Blank lines are ignored.  A line that fails to load is logged with its
        1-based line number and skipped; utterances longer than
        ``max_phoneme_ids`` are counted and reported once at the end.
        """
        num_skipped = 0

        with open(dataset_path, "r", encoding="utf-8") as dataset_file:
            for line_number, raw_line in enumerate(dataset_file, start=1):
                line = raw_line.strip()
                if not line:
                    continue

                try:
                    utt = PiperDataset.load_utterance(line)
                    too_long = (max_phoneme_ids is not None) and (
                        len(utt.phoneme_ids) > max_phoneme_ids
                    )
                    if too_long:
                        num_skipped += 1
                    else:
                        yield utt
                except Exception:
                    _LOGGER.exception(
                        "Error on line %s of %s: %s",
                        line_number,
                        dataset_path,
                        line,
                    )

        if num_skipped > 0:
            _LOGGER.warning("Skipped %s utterance(s)", num_skipped)

    @staticmethod
    def load_utterance(line: str) -> Utterance:
        """Parse one JSON line into an ``Utterance``.

        Raises ``KeyError`` when a required key is missing.
        """
        fields = json.loads(line)
        phoneme_ids = fields["phoneme_ids"]
        audio_norm_path = Path(fields["audio_norm_path"])
        audio_spec_path = Path(fields["audio_spec_path"])
        return Utterance(
            phoneme_ids=phoneme_ids,
            audio_norm_path=audio_norm_path,
            audio_spec_path=audio_spec_path,
            speaker_id=fields.get("speaker_id"),
            text=fields.get("text"),
        )
class UtteranceCollate:
    """Callable that pads a sequence of utterances into a single ``Batch``."""

    def __init__(self, is_multispeaker: bool, segment_size: int):
        """
        Args:
            is_multispeaker: When True every utterance must carry a speaker id
                and the batch gets a ``speaker_ids`` tensor.
            segment_size: Lower bound for the padded audio length.
        """
        self.is_multispeaker = is_multispeaker
        self.segment_size = segment_size

    def __call__(self, utterances: Sequence[UtteranceTensors]) -> Batch:
        """Zero-pad all utterances to common lengths.

        Rows of the returned batch are ordered by decreasing spectrogram
        length, not by input order.
        """
        num_utterances = len(utterances)
        assert num_utterances > 0, "No utterances"

        longest_phonemes = 0
        longest_spec = 0
        longest_audio = 0
        num_mels = 0

        # First pass: validate the utterances and find the padded sizes.
        for utt in utterances:
            assert utt.spectrogram is not None
            assert utt.audio_norm is not None

            longest_phonemes = max(longest_phonemes, utt.phoneme_ids.size(0))
            longest_spec = max(longest_spec, utt.spectrogram.size(1))
            longest_audio = max(longest_audio, utt.audio_norm.size(1))
            num_mels = utt.spectrogram.size(0)

            if self.is_multispeaker:
                assert utt.speaker_id is not None, "Missing speaker id"

        # Padded audio is never shorter than the configured segment size.
        longest_audio = max(longest_audio, self.segment_size)

        # Zero-filled destinations for the padded data.
        phonemes_padded = LongTensor(num_utterances, longest_phonemes).zero_()
        spec_padded = FloatTensor(num_utterances, num_mels, longest_spec).zero_()
        audio_padded = FloatTensor(num_utterances, 1, longest_audio).zero_()

        # Length tensors start uninitialized; every entry is written below.
        phoneme_lengths = LongTensor(num_utterances)
        spec_lengths = LongTensor(num_utterances)
        audio_lengths = LongTensor(num_utterances)

        speaker_ids: Optional[LongTensor] = (
            LongTensor(num_utterances) if self.is_multispeaker else None
        )

        # Second pass: copy the data, longest spectrogram first.
        by_spec_length = sorted(
            utterances, key=lambda u: u.spectrogram.size(1), reverse=True
        )
        for row, utt in enumerate(by_spec_length):
            n_phonemes = utt.phoneme_ids.size(0)
            n_spec = utt.spectrogram.size(1)
            n_audio = utt.audio_norm.size(1)

            phonemes_padded[row, :n_phonemes] = utt.phoneme_ids
            phoneme_lengths[row] = n_phonemes

            spec_padded[row, :, :n_spec] = utt.spectrogram
            spec_lengths[row] = n_spec

            audio_padded[row, :, :n_audio] = utt.audio_norm
            audio_lengths[row] = n_audio

            if utt.speaker_id is not None:
                assert speaker_ids is not None
                speaker_ids[row] = utt.speaker_id

        return Batch(
            phoneme_ids=phonemes_padded,
            phoneme_lengths=phoneme_lengths,
            spectrograms=spec_padded,
            spectrogram_lengths=spec_lengths,
            audios=audio_padded,
            audio_lengths=audio_lengths,
            speaker_ids=speaker_ids,
        )

View File

@@ -0,0 +1,352 @@
import logging
from pathlib import Path
from typing import List, Optional, Tuple, Union
import pytorch_lightning as pl
import torch
from torch import autocast
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from .commons import slice_segments
from .dataset import Batch, PiperDataset, UtteranceCollate
from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss
from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch
from .models import MultiPeriodDiscriminator, SynthesizerTrn
_LOGGER = logging.getLogger("vits.lightning")
class VitsModel(pl.LightningModule):
    """PyTorch Lightning module that trains a VITS generator and discriminator.

    The generator (``SynthesizerTrn``) and the discriminator
    (``MultiPeriodDiscriminator``) each get their own AdamW optimizer and
    exponential learning-rate scheduler; ``training_step`` dispatches on
    ``optimizer_idx``.  When ``dataset`` is given it is loaded in the
    constructor and split into train / test / validation subsets.
    """

    def __init__(
        self,
        num_symbols: int,
        num_speakers: int,
        # audio
        resblock="2",
        resblock_kernel_sizes=(3, 5, 7),
        resblock_dilation_sizes=(
            (1, 2),
            (2, 6),
            (3, 12),
        ),
        upsample_rates=(8, 8, 4),
        upsample_initial_channel=256,
        upsample_kernel_sizes=(16, 16, 8),
        # mel
        filter_length: int = 1024,
        hop_length: int = 256,
        win_length: int = 1024,
        mel_channels: int = 80,
        sample_rate: int = 22050,
        sample_bytes: int = 2,
        channels: int = 1,
        mel_fmin: float = 0.0,
        mel_fmax: Optional[float] = None,
        # model
        inter_channels: int = 192,
        hidden_channels: int = 192,
        filter_channels: int = 768,
        n_heads: int = 2,
        n_layers: int = 6,
        kernel_size: int = 3,
        p_dropout: float = 0.1,
        n_layers_q: int = 3,
        use_spectral_norm: bool = False,
        gin_channels: int = 0,
        use_sdp: bool = True,
        segment_size: int = 8192,
        # training
        dataset: Optional[List[Union[str, Path]]] = None,
        learning_rate: float = 2e-4,
        betas: Tuple[float, float] = (0.8, 0.99),
        eps: float = 1e-9,
        batch_size: int = 1,
        lr_decay: float = 0.999875,
        init_lr_ratio: float = 1.0,
        warmup_epochs: int = 0,
        c_mel: int = 45,
        c_kl: float = 1.0,
        grad_clip: Optional[float] = None,
        num_workers: int = 1,
        seed: int = 1234,
        num_test_examples: int = 5,
        validation_split: float = 0.1,
        max_phoneme_ids: Optional[int] = None,
        **kwargs,
    ):
        """Store all arguments as hyperparameters, build both models and load data.

        Every constructor argument is recorded through
        ``save_hyperparameters()`` and read back from ``self.hparams``.  When
        ``num_speakers > 1`` and ``gin_channels <= 0``, ``gin_channels`` is
        replaced by 512.
        """
        super().__init__()
        # Must be called from __init__ itself so the arguments are captured.
        self.save_hyperparameters()

        if (self.hparams.num_speakers > 1) and (self.hparams.gin_channels <= 0):
            # Default gin_channels for multi-speaker model
            self.hparams.gin_channels = 512

        # Set up models
        self.model_g = SynthesizerTrn(
            n_vocab=self.hparams.num_symbols,
            spec_channels=self.hparams.filter_length // 2 + 1,
            segment_size=self.hparams.segment_size // self.hparams.hop_length,
            inter_channels=self.hparams.inter_channels,
            hidden_channels=self.hparams.hidden_channels,
            filter_channels=self.hparams.filter_channels,
            n_heads=self.hparams.n_heads,
            n_layers=self.hparams.n_layers,
            kernel_size=self.hparams.kernel_size,
            p_dropout=self.hparams.p_dropout,
            resblock=self.hparams.resblock,
            resblock_kernel_sizes=self.hparams.resblock_kernel_sizes,
            resblock_dilation_sizes=self.hparams.resblock_dilation_sizes,
            upsample_rates=self.hparams.upsample_rates,
            upsample_initial_channel=self.hparams.upsample_initial_channel,
            upsample_kernel_sizes=self.hparams.upsample_kernel_sizes,
            n_speakers=self.hparams.num_speakers,
            gin_channels=self.hparams.gin_channels,
            use_sdp=self.hparams.use_sdp,
        )
        self.model_d = MultiPeriodDiscriminator(
            use_spectral_norm=self.hparams.use_spectral_norm
        )

        # Dataset splits
        self._train_dataset: Optional[Dataset] = None
        self._val_dataset: Optional[Dataset] = None
        self._test_dataset: Optional[Dataset] = None
        self._load_datasets(validation_split, num_test_examples, max_phoneme_ids)

        # State kept between training optimizers
        self._y = None
        self._y_hat = None

    def _load_datasets(
        self,
        validation_split: float,
        num_test_examples: int,
        max_phoneme_ids: Optional[int] = None,
    ):
        """Load ``hparams.dataset`` and split it into train / test / validation.

        Does nothing when no dataset was configured.  The validation subset
        holds ``int(len * validation_split)`` items, the test subset
        ``num_test_examples`` items, and the training subset the remainder.
        """
        if self.hparams.dataset is None:
            _LOGGER.debug("No dataset to load")
            return

        full_dataset = PiperDataset(
            self.hparams.dataset, max_phoneme_ids=max_phoneme_ids
        )
        valid_set_size = int(len(full_dataset) * validation_split)
        train_set_size = len(full_dataset) - valid_set_size - num_test_examples

        # The order of the unpacked subsets matches the order of the lengths.
        self._train_dataset, self._test_dataset, self._val_dataset = random_split(
            full_dataset, [train_set_size, num_test_examples, valid_set_size]
        )

    def forward(self, text, text_lengths, scales, sid=None):
        """Synthesize audio with the generator.

        Args:
            text: Phoneme ids passed to ``model_g.infer``.
            text_lengths: Lengths passed to ``model_g.infer``.
            scales: Indexable with three entries, read in this order:
                noise scale, length scale, noise scale for the duration
                predictor (``noise_scale_w``).
            sid: Optional speaker id(s).

        Returns:
            The first value returned by ``model_g.infer``.
        """
        noise_scale = scales[0]
        length_scale = scales[1]
        noise_scale_w = scales[2]
        audio, *_ = self.model_g.infer(
            text,
            text_lengths,
            noise_scale=noise_scale,
            length_scale=length_scale,
            noise_scale_w=noise_scale_w,
            sid=sid,
        )

        return audio

    def _make_dataloader(self, dataset):
        """Build a DataLoader over ``dataset`` with the shared collate settings."""
        return DataLoader(
            dataset,
            collate_fn=UtteranceCollate(
                is_multispeaker=self.hparams.num_speakers > 1,
                segment_size=self.hparams.segment_size,
            ),
            num_workers=self.hparams.num_workers,
            batch_size=self.hparams.batch_size,
        )

    def train_dataloader(self):
        """Return the DataLoader for the training subset."""
        return self._make_dataloader(self._train_dataset)

    def val_dataloader(self):
        """Return the DataLoader for the validation subset."""
        return self._make_dataloader(self._val_dataset)

    def test_dataloader(self):
        """Return the DataLoader for the test subset."""
        return self._make_dataloader(self._test_dataset)

    def training_step(self, batch: Batch, batch_idx: int, optimizer_idx: int):
        """Run the generator step for optimizer 0, the discriminator step for 1.

        Any other ``optimizer_idx`` yields ``None``.
        """
        if optimizer_idx == 0:
            return self.training_step_g(batch)

        if optimizer_idx == 1:
            return self.training_step_d(batch)

        return None

    def training_step_g(self, batch: Batch):
        """Compute the total generator loss for ``batch``.

        Also stores the sliced real audio and the generated audio on the
        instance so ``training_step_d`` can reuse them.
        """
        x = batch.phoneme_ids
        x_lengths = batch.phoneme_lengths
        y = batch.audios
        spec = batch.spectrograms
        spec_lengths = batch.spectrogram_lengths
        speaker_ids = batch.speaker_ids

        (
            y_hat,
            l_length,
            _attn,
            ids_slice,
            _x_mask,
            z_mask,
            (_z, z_p, m_p, logs_p, _m_q, logs_q),
        ) = self.model_g(x, x_lengths, spec, spec_lengths, speaker_ids)
        self._y_hat = y_hat

        mel = spec_to_mel_torch(
            spec,
            self.hparams.filter_length,
            self.hparams.mel_channels,
            self.hparams.sample_rate,
            self.hparams.mel_fmin,
            self.hparams.mel_fmax,
        )
        # segment_size is in audio samples; dividing by hop_length converts
        # it to spectrogram frames.
        y_mel = slice_segments(
            mel,
            ids_slice,
            self.hparams.segment_size // self.hparams.hop_length,
        )
        y_hat_mel = mel_spectrogram_torch(
            y_hat.squeeze(1),
            self.hparams.filter_length,
            self.hparams.mel_channels,
            self.hparams.sample_rate,
            self.hparams.hop_length,
            self.hparams.win_length,
            self.hparams.mel_fmin,
            self.hparams.mel_fmax,
        )
        # Slice indices are in frames, so scale them back to samples.
        y = slice_segments(
            y,
            ids_slice * self.hparams.hop_length,
            self.hparams.segment_size,
        )  # slice

        # Save for training_step_d
        self._y = y

        _y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = self.model_d(y, y_hat)

        with autocast(self.device.type, enabled=False):
            # Generator loss
            loss_dur = torch.sum(l_length.float())
            loss_mel = F.l1_loss(y_mel, y_hat_mel) * self.hparams.c_mel
            loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * self.hparams.c_kl

            loss_fm = feature_loss(fmap_r, fmap_g)
            loss_gen, _losses_gen = generator_loss(y_d_hat_g)
            loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl

            self.log("loss_gen_all", loss_gen_all)

            return loss_gen_all

    def training_step_d(self, batch: Batch):
        """Compute the discriminator loss.

        Uses the audio stored by the preceding ``training_step_g`` call, not
        ``batch``; the generated audio is detached before it is scored.
        """
        # From training_step_g
        y = self._y
        y_hat = self._y_hat
        y_d_hat_r, y_d_hat_g, _, _ = self.model_d(y, y_hat.detach())

        with autocast(self.device.type, enabled=False):
            # Discriminator
            loss_disc, _losses_disc_r, _losses_disc_g = discriminator_loss(
                y_d_hat_r, y_d_hat_g
            )
            loss_disc_all = loss_disc

            self.log("loss_disc_all", loss_disc_all)

            return loss_disc_all

    def validation_step(self, batch: Batch, batch_idx: int):
        """Log the validation loss and an audio sample for every test utterance.

        The validation loss is the sum of the generator and discriminator
        losses on ``batch``.
        """
        val_loss = self.training_step_g(batch) + self.training_step_d(batch)
        self.log("val_loss", val_loss)

        # Generate audio examples
        for utt_idx, test_utt in enumerate(self._test_dataset):
            text = test_utt.phoneme_ids.unsqueeze(0).to(self.device)
            text_lengths = torch.LongTensor([len(test_utt.phoneme_ids)]).to(self.device)
            scales = [0.667, 1.0, 0.8]
            sid = (
                test_utt.speaker_id.to(self.device)
                if test_utt.speaker_id is not None
                else None
            )
            test_audio = self(text, text_lengths, scales, sid=sid).detach()

            # Scale to make louder in [-1, 1].  The peak must be the largest
            # absolute sample: with the signed maximum a clip whose negative
            # peak dominates would still exceed [-1, 1] after scaling.  The
            # 0.01 floor avoids blowing up near-silent audio.
            peak = test_audio.abs().max()
            test_audio = test_audio * (1.0 / max(0.01, peak))

            tag = test_utt.text or str(utt_idx)
            self.logger.experiment.add_audio(
                tag, test_audio, sample_rate=self.hparams.sample_rate
            )

        return val_loss

    def configure_optimizers(self):
        """Return AdamW optimizers and ExponentialLR schedulers.

        Index 0 belongs to the generator and index 1 to the discriminator,
        which is the ``optimizer_idx`` that ``training_step`` dispatches on.
        """
        optimizers = [
            torch.optim.AdamW(
                self.model_g.parameters(),
                lr=self.hparams.learning_rate,
                betas=self.hparams.betas,
                eps=self.hparams.eps,
            ),
            torch.optim.AdamW(
                self.model_d.parameters(),
                lr=self.hparams.learning_rate,
                betas=self.hparams.betas,
                eps=self.hparams.eps,
            ),
        ]
        schedulers = [
            torch.optim.lr_scheduler.ExponentialLR(
                optimizers[0], gamma=self.hparams.lr_decay
            ),
            torch.optim.lr_scheduler.ExponentialLR(
                optimizers[1], gamma=self.hparams.lr_decay
            ),
        ]

        return optimizers, schedulers

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Register the model's command-line options on ``parent_parser``.

        Options are added to an argument group named "VitsModel"; the parent
        parser itself is returned.
        """
        parser = parent_parser.add_argument_group("VitsModel")
        parser.add_argument("--batch-size", type=int, required=True)
        parser.add_argument("--validation-split", type=float, default=0.1)
        parser.add_argument("--num-test-examples", type=int, default=5)
        parser.add_argument(
            "--max-phoneme-ids",
            type=int,
            help="Exclude utterances with phoneme id lists longer than this",
        )
        #
        parser.add_argument("--hidden-channels", type=int, default=192)
        parser.add_argument("--inter-channels", type=int, default=192)
        parser.add_argument("--filter-channels", type=int, default=768)
        parser.add_argument("--n-layers", type=int, default=6)
        parser.add_argument("--n-heads", type=int, default=2)
        #
        return parent_parser

View File

@@ -0,0 +1,58 @@
import torch
def feature_loss(fmap_r, fmap_g):
    """Feature-matching loss between real and generated feature maps.

    Args:
        fmap_r: Iterable of iterables of tensors computed from real audio;
            these are detached, so no gradient flows into them.
        fmap_g: Matching structure computed from generated audio.

    Returns:
        Twice the sum of the mean absolute differences of all paired tensors
        (the integer 0 when there are no pairs).
    """
    total = 0
    for real_maps, fake_maps in zip(fmap_r, fmap_g):
        for real, fake in zip(real_maps, fake_maps):
            difference = real.float().detach() - fake.float()
            total = total + difference.abs().mean()
    return total * 2
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss.

    For each pair of outputs the real term is ``mean((1 - real) ** 2)`` and
    the generated term is ``mean(generated ** 2)``.

    Returns:
        ``(loss, r_losses, g_losses)``: the sum of all terms, and the real and
        generated terms as lists of Python floats.
    """
    total = 0
    real_terms = []
    fake_terms = []
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out.float()) ** 2)
        fake_term = torch.mean(fake_out.float() ** 2)
        total = total + (real_term + fake_term)
        real_terms.append(real_term.item())
        fake_terms.append(fake_term.item())
    return total, real_terms, fake_terms
def generator_loss(disc_outputs):
    """Least-squares generator loss.

    Each discriminator output contributes ``mean((1 - output) ** 2)``.

    Returns:
        ``(loss, gen_losses)``: the sum of all terms and the list of the
        individual terms (kept as tensors).
    """
    total = 0
    terms = []
    for output in disc_outputs:
        term = torch.mean((1 - output.float()) ** 2)
        terms.append(term)
        total = total + term
    return total, terms
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """
    Masked KL term between the posterior and the prior.

    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]

    All inputs are converted to float first.  The element-wise term is
    multiplied by ``z_mask``, summed, and divided by the sum of ``z_mask``.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (
        tensor.float() for tensor in (z_p, logs_q, m_p, logs_p, z_mask)
    )

    log_ratio = logs_p - logs_q - 0.5
    squared_distance = 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
    masked_total = torch.sum((log_ratio + squared_distance) * z_mask)
    return masked_total / torch.sum(z_mask)

View File

@@ -0,0 +1,139 @@
import torch
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
# 32768 == 2 ** 15.  NOTE(review): presumably the full-scale magnitude of
# 16-bit PCM used to normalize samples; it is not referenced by the functions
# visible in this file - confirm against its users.
MAX_WAV_VALUE = 32768.0
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    Log-compress ``x`` after clamping it from below.

    PARAMS
    ------
    C: compression factor
    clip_val: lower bound applied to ``x`` before the logarithm
    """
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)
def dynamic_range_decompression_torch(x, C=1):
    """
    Undo the log compression: exponentiate ``x`` and divide by ``C``.

    PARAMS
    ------
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default arguments."""
    return dynamic_range_compression_torch(magnitudes)
def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default arguments."""
    return dynamic_range_decompression_torch(magnitudes)
# Module-level caches, filled lazily by the spectrogram functions below and
# keyed by strings built from call parameters plus the tensor dtype and device.
# mel_basis holds mel filterbank tensors; hann_window holds Hann window tensors.
mel_basis = {}
hann_window = {}
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute the magnitude spectrogram of ``y``.

    Args:
        y: Audio tensor; a message is printed when values fall outside
            [-1, 1].
        n_fft: FFT size passed to ``torch.stft``.
        sampling_rate: Accepted but not used by this function.
        hop_size: Hop length passed to ``torch.stft``.
        win_size: Window length; the Hann window is cached per
            (win_size, dtype, device).
        center: Forwarded to ``torch.stft``.

    Returns:
        ``sqrt(real ** 2 + imag ** 2 + 1e-6)`` of the STFT.
    """
    lowest = torch.min(y)
    if lowest < -1.0:
        print("min value is ", lowest)
    highest = torch.max(y)
    if highest > 1.0:
        print("max value is ", highest)

    window_key = str(win_size) + "_" + str(y.dtype) + "_" + str(y.device)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).type_as(y)

    # Reflect-pad both ends by the same amount before the STFT.
    edge = int((n_fft - hop_size) / 2)
    padded = torch.nn.functional.pad(
        y.unsqueeze(1), (edge, edge), mode="reflect"
    ).squeeze(1)

    transformed = torch.stft(
        padded,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[window_key],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    real_imag = torch.view_as_real(transformed)

    # The small constant keeps sqrt away from an exact zero.
    return torch.sqrt(real_imag.pow(2).sum(-1) + 1e-6)
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a linear spectrogram onto a mel filterbank and log-compress it.

    The filterbank comes from ``librosa_mel_fn`` and is cached in the
    module-level ``mel_basis`` dict.  The cache key covers every parameter the
    filterbank depends on (sampling rate, FFT size, number of mels, fmin,
    fmax) as well as the dtype and device of ``spec``, so calls with different
    settings never receive a filterbank built for other settings.

    Args:
        spec: Linear spectrogram tensor.
        n_fft: FFT size the spectrogram was computed with.
        num_mels: Number of mel bands.
        sampling_rate: Sampling rate passed to librosa as ``sr``.
        fmin: Lowest filterbank frequency.
        fmax: Highest filterbank frequency (may be ``None``).

    Returns:
        The result of ``spectral_normalize_torch`` applied to
        ``mel_filterbank @ spec``.
    """
    global mel_basis
    basis_key = "_".join(
        str(part)
        for part in (sampling_rate, n_fft, num_mels, fmin, fmax, spec.dtype, spec.device)
    )
    if basis_key not in mel_basis:
        mel = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[basis_key] = torch.from_numpy(mel).type_as(spec)
    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)
    return spec
def mel_spectrogram_torch(
    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
    """Compute a log-compressed mel spectrogram of ``y``.

    The mel filterbank and the Hann window are cached in the module-level
    ``mel_basis`` and ``hann_window`` dicts.  The filterbank key covers every
    parameter the filterbank depends on (sampling rate, FFT size, number of
    mels, fmin, fmax) plus the dtype and device of ``y``, so calls with
    different settings never receive a filterbank built for other settings.

    Args:
        y: Audio tensor; a message is printed when values fall outside
            [-1, 1].
        n_fft: FFT size.
        num_mels: Number of mel bands.
        sampling_rate: Sampling rate passed to librosa as ``sr``.
        hop_size: Hop length passed to ``torch.stft``.
        win_size: Window length.
        fmin: Lowest filterbank frequency.
        fmax: Highest filterbank frequency (may be ``None``).
        center: Forwarded to ``torch.stft``.

    Returns:
        The result of ``spectral_normalize_torch`` applied to
        ``mel_filterbank @ magnitude_spectrogram``.
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    basis_key = "_".join(
        str(part)
        for part in (sampling_rate, n_fft, num_mels, fmin, fmax, y.dtype, y.device)
    )
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if basis_key not in mel_basis:
        mel = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[basis_key] = torch.from_numpy(mel).type_as(y)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).type_as(y)

    # Reflect-pad both ends by the same amount before the STFT.
    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    spec = torch.view_as_real(
        torch.stft(
            y,
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[wnsize_dtype_device],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )

    # Magnitude; the small constant keeps sqrt away from an exact zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec

View File

@@ -0,0 +1,732 @@
import math
import typing
import torch
from torch import nn
from torch.nn import Conv1d, Conv2d, ConvTranspose1d
from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from . import attentions, commons, modules, monotonic_align
from .commons import get_padding, init_weights
class StochasticDurationPredictor(nn.Module):
    """Flow-based duration predictor.

    ``forward`` has two modes: with ``reverse=False`` it needs the target
    ``w`` and returns a per-batch-item loss (``nll + logq``); with
    ``reverse=True`` it samples noise and returns ``logw`` by running the
    flows backwards.
    """

    def __init__(
        self,
        in_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float,
        n_flows: int = 4,
        gin_channels: int = 0,
    ):
        """Build the main flows, the posterior flows and the conditioning layers.

        Note that the ``filter_channels`` argument is ignored: it is
        overwritten with ``in_channels`` on the first line below.  The
        conditioning layer ``cond`` only exists when ``gin_channels != 0``.
        """
        super().__init__()
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        # Main flows: one ElementwiseAffine, then n_flows (ConvFlow, Flip) pairs.
        self.log_flow = modules.Log()
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for i in range(n_flows):
            self.flows.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            self.flows.append(modules.Flip())

        # Posterior branch; the number of (ConvFlow, Flip) pairs is fixed at 4
        # and does not follow n_flows.
        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for i in range(4):
            self.post_flows.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            self.post_flows.append(modules.Flip())

        # Layers applied to the input x in both modes.
        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        """Return the duration loss (forward mode) or sampled ``logw`` (reverse mode).

        Args:
            x: Input features; detached, so no gradient reaches its producer.
            x_mask: Mask multiplied into intermediate results.
            w: Target values; required when ``reverse`` is False.
            g: Optional conditioning input; detached and added through
                ``self.cond``.
            reverse: Selects sampling mode when True.
            noise_scale: Multiplier for the noise drawn in reverse mode.
        """
        x = torch.detach(x)
        x = self.pre(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if not reverse:
            flows = self.flows
            assert w is not None

            # Posterior branch: push noise e_q through post_flows conditioned
            # on x and an encoding of w, accumulating the log-determinants.
            logdet_tot_q = 0
            h_w = self.post_pre(w)
            h_w = self.post_convs(h_w, x_mask)
            h_w = self.post_proj(h_w) * x_mask
            e_q = torch.randn(w.size(0), 2, w.size(2)).type_as(x) * x_mask
            z_q = e_q
            for flow in self.post_flows:
                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
                logdet_tot_q += logdet_q
            z_u, z1 = torch.split(z_q, [1, 1], 1)
            # u lies in (0, 1) and is subtracted from w before the log flow.
            u = torch.sigmoid(z_u) * x_mask
            z0 = (w - u) * x_mask
            logdet_tot_q += torch.sum(
                (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
            )
            logq = (
                torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
                - logdet_tot_q
            )

            # Main branch: log flow on z0, then all flows conditioned on x.
            logdet_tot = 0
            z0, logdet = self.log_flow(z0, x_mask)
            logdet_tot += logdet
            z = torch.cat([z0, z1], 1)
            for flow in flows:
                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
                logdet_tot = logdet_tot + logdet
            nll = (
                torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
                - logdet_tot
            )
            return nll + logq  # [b]
        else:
            # Run the flows in reverse order, dropping the second-to-last
            # entry of the reversed list.
            flows = list(reversed(self.flows))
            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
            z = torch.randn(x.size(0), 2, x.size(2)).type_as(x) * noise_scale
            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            # Only the first of the two channels is returned.
            z0, z1 = torch.split(z, [1, 1], 1)
            logw = z0
            return logw
class DurationPredictor(nn.Module):
    """Deterministic duration predictor.

    Two (conv, ReLU, layer norm, dropout) stages followed by a 1x1 projection
    to a single channel.  The conditioning layer ``cond`` only exists when
    ``gin_channels != 0``.
    """

    def __init__(
        self,
        in_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float,
        gin_channels: int = 0,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        # Padding of kernel_size // 2 on both convolutions.
        same_padding = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=same_padding
        )
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=same_padding
        )
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """Predict one value per position.

        ``x`` and ``g`` are detached, so no gradient flows back into whatever
        produced them.  The mask is applied before every convolution and to
        the output.
        """
        hidden = x.detach()
        if g is not None:
            hidden = hidden + self.cond(g.detach())

        hidden = self.drop(self.norm_1(torch.relu(self.conv_1(hidden * x_mask))))
        hidden = self.drop(self.norm_2(torch.relu(self.conv_2(hidden * x_mask))))

        return self.proj(hidden * x_mask) * x_mask
class TextEncoder(nn.Module):
    """Embeds token ids, runs them through ``attentions.Encoder`` and
    projects the result to prior statistics (mean and log-scale)."""

    def __init__(
        self,
        n_vocab: int,
        out_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
    ):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.emb = nn.Embedding(n_vocab, hidden_channels)
        # Initialise with std = hidden_channels ** -0.5; forward() scales the
        # embeddings back up by sqrt(hidden_channels).
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        # Twice out_channels: one half for the mean, one for the log-scale.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths):
        """Return ``(hidden, m, logs, x_mask)`` for token ids ``x``."""
        embedded = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
        hidden = embedded.transpose(1, -1)  # [b, h, t]
        x_mask = (
            commons.sequence_mask(x_lengths, hidden.size(2))
            .unsqueeze(1)
            .type_as(hidden)
        )
        hidden = self.encoder(hidden * x_mask, x_mask)
        m, logs = torch.split(self.proj(hidden) * x_mask, self.out_channels, dim=1)
        return hidden, m, logs, x_mask
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` mean-only residual coupling layers, each followed
    by a channel ``Flip``; applied in reverse order when inverting."""

    def __init__(
        self,
        channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        n_flows: int = 4,
        gin_channels: int = 0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Transform ``x`` through the stack; log-determinants are discarded."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
class PosteriorEncoder(nn.Module):
    """Encodes the input with a ``modules.WN`` stack into mean/log-scale
    statistics and draws a reparameterised sample from them."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        gin_channels: int = 0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        # Twice out_channels: mean and log-scale halves.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` where ``z = m + eps * exp(logs)``."""
        x_mask = (
            commons.sequence_mask(x_lengths, x.size(2)).unsqueeze(1).type_as(x)
        )
        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        m, logs = torch.split(self.proj(hidden) * x_mask, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
class Generator(torch.nn.Module):
    """Waveform decoder built from transposed-convolution upsampling stages.

    Each upsampling stage is followed by ``len(resblock_kernel_sizes)``
    residual blocks whose outputs are averaged. The final layer is a
    single-channel convolution followed by ``tanh``.
    """

    def __init__(
        self,
        initial_channel: int,
        resblock: typing.Optional[str],
        resblock_kernel_sizes: typing.Tuple[int, ...],
        resblock_dilation_sizes: typing.Tuple[typing.Tuple[int, ...], ...],
        upsample_rates: typing.Tuple[int, ...],
        upsample_initial_channel: int,
        upsample_kernel_sizes: typing.Tuple[int, ...],
        gin_channels: int = 0,
    ):
        super(Generator, self).__init__()
        self.LRELU_SLOPE = 0.1
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; any other value (including None) selects ResBlock2.
        resblock_module = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
        self.ups = nn.ModuleList()
        # Stage i halves the channel count: C // 2**i -> C // 2**(i + 1).
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
        # Flat list: the blocks for stage i occupy indices
        # [i * num_kernels, (i + 1) * num_kernels).
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock_module(ch, k, d))
        # `ch` is the channel count of the last stage; it is only defined
        # when there is at least one upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)
        # Optional conditioning projection, added to the conv_pre output.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x`` (optionally conditioned on ``g``) into a tanh-bounded
        single-channel signal."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)
        for i, up in enumerate(self.ups):
            x = F.leaky_relu(x, self.LRELU_SLOPE)
            x = up(x)
            # Placeholder; overwritten by the first residual block of this stage.
            xs = torch.zeros(1)
            # Walk every block and keep only those belonging to stage i
            # (relative index in [0, num_kernels)).
            for j, resblock in enumerate(self.resblocks):
                index = j - (i * self.num_kernels)
                if index == 0:
                    xs = resblock(x)
                elif (index > 0) and (index < self.num_kernels):
                    xs += resblock(x)
            # Average the outputs of this stage's residual blocks.
            x = xs / self.num_kernels
        # NOTE: no slope argument here, so F.leaky_relu's default negative
        # slope is used rather than self.LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers and residual blocks."""
        print("Removing weight norm...")
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the signal into a 2-D grid whose last axis
    has length ``period`` and applies a stack of (k, 1) convolutions."""

    def __init__(
        self,
        period: int,
        kernel_size: int = 5,
        stride: int = 3,
        use_spectral_norm: bool = False,
    ):
        super(DiscriminatorP, self).__init__()
        self.LRELU_SLOPE = 0.1
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        same_pad = (get_padding(kernel_size, 1), 0)
        # (in_channels, out_channels, stride) per layer; the last layer is unstrided.
        layer_specs = [
            (1, 32, (stride, 1)),
            (32, 128, (stride, 1)),
            (128, 512, (stride, 1)),
            (512, 1024, (stride, 1)),
            (1024, 1024, 1),
        ]
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv2d(
                        c_in, c_out, (kernel_size, 1), layer_stride, padding=same_pad
                    )
                )
                for c_in, c_out, layer_stride in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_scores, feature_maps)`` for the input ``x``."""
        fmap = []

        # 1d to 2d: reflect-pad the time axis up to a multiple of the period.
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), self.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
class DiscriminatorS(torch.nn.Module):
    """Discriminator made of strided, grouped 1-D convolutions applied
    directly to the input signal."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        self.LRELU_SLOPE = 0.1
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv1d(c_in, c_out, kernel, step, groups=groups, padding=pad)
                )
                for c_in, c_out, kernel, step, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_scores, feature_maps)`` for the input ``x``."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), self.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
class MultiPeriodDiscriminator(torch.nn.Module):
    """Ensemble of one ``DiscriminatorS`` followed by ``DiscriminatorP``
    instances for periods 2, 3, 5, 7 and 11."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(
            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            for period in (2, 3, 5, 7, 11)
        )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and ``y_hat``.

        Returns four parallel lists: scores for ``y``, scores for ``y_hat``,
        feature maps for ``y`` and feature maps for ``y_hat``.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for disc in self.discriminators:
            score_r, feats_r = disc(y)
            score_g, feats_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Wires together the text encoder (``enc_p``), waveform decoder (``dec``),
    posterior encoder (``enc_q``), coupling flow (``flow``) and a duration
    predictor (``dp``, stochastic when ``use_sdp`` is true). A speaker
    embedding table (``emb_g``) is created only when ``n_speakers > 1``.
    """

    def __init__(
        self,
        n_vocab: int,
        spec_channels: int,
        segment_size: int,
        inter_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        resblock: str,
        resblock_kernel_sizes: typing.Tuple[int, ...],
        resblock_dilation_sizes: typing.Tuple[typing.Tuple[int, ...], ...],
        upsample_rates: typing.Tuple[int, ...],
        upsample_initial_channel: int,
        upsample_kernel_sizes: typing.Tuple[int, ...],
        n_speakers: int = 1,
        gin_channels: int = 0,
        use_sdp: bool = True,
    ):
        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.use_sdp = use_sdp
        # Prior (text) encoder.
        self.enc_p = TextEncoder(
            n_vocab,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        # Latent -> waveform decoder.
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        # Posterior encoder: kernel_size=5, dilation_rate=1, n_layers=16.
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        # Flow: kernel_size=5, dilation_rate=1, n_layers=4.
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
        )
        if use_sdp:
            self.dp = StochasticDurationPredictor(
                hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
            )
        else:
            self.dp = DurationPredictor(
                hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
            )
        # Speaker embeddings exist only for multi-speaker models.
        if n_speakers > 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)

    def forward(self, x, x_lengths, y, y_lengths, sid=None):
        """Training pass.

        Encodes text ``x`` and target features ``y``, aligns them with
        ``monotonic_align.maximum_path``, computes the duration loss and
        decodes a random slice of the posterior latent.

        Returns:
            ``(o, l_length, attn, ids_slice, x_mask, y_mask,
            (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        if self.n_speakers > 1:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = None
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        # The alignment search is excluded from the autograd graph.
        with torch.no_grad():
            # negative cross-entropy
            # The four terms below expand the diagonal-Gaussian log-density
            # of z_p under N(m_p, exp(logs_p)) for every (t_t, t_s) pair.
            s_p_sq_r = torch.exp(-2 * logs_p)  # [b, d, t]
            neg_cent1 = torch.sum(
                -0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True
            )  # [b, 1, t_s]
            neg_cent2 = torch.matmul(
                -0.5 * (z_p**2).transpose(1, 2), s_p_sq_r
            )  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
            neg_cent3 = torch.matmul(
                z_p.transpose(1, 2), (m_p * s_p_sq_r)
            )  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
            neg_cent4 = torch.sum(
                -0.5 * (m_p**2) * s_p_sq_r, [1], keepdim=True
            )  # [b, 1, t_s]
            neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
            attn = (
                monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1))
                .unsqueeze(1)
                .detach()
            )
        # Per-token durations: alignment summed over axis 2.
        w = attn.sum(2)
        if self.use_sdp:
            l_length = self.dp(x, x_mask, w, g=g)
            l_length = l_length / torch.sum(x_mask)
        else:
            # Squared error in the log domain; 1e-6 guards log(0).
            logw_ = torch.log(w + 1e-6) * x_mask
            logw = self.dp(x, x_mask, g=g)
            l_length = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum(
                x_mask
            )  # for averaging
        # expand prior
        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
        # Only a random segment of length segment_size is decoded.
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return (
            o,
            l_length,
            attn,
            ids_slice,
            x_mask,
            y_mask,
            (z, z_p, m_p, logs_p, m_q, logs_q),
        )

    def infer(
        self,
        x,
        x_lengths,
        sid=None,
        noise_scale=0.667,
        length_scale=1,
        noise_scale_w=0.8,
        max_len=None,
    ):
        """Synthesise from token ids ``x``.

        ``noise_scale`` scales the prior sampling noise, ``noise_scale_w`` is
        passed to the stochastic duration predictor, ``length_scale``
        multiplies predicted durations, and ``max_len`` truncates the latent
        along its last axis before decoding (``None`` keeps everything).

        Returns:
            ``(o, attn, y_mask, (z, z_p, m_p, logs_p))``.
        """
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        if self.n_speakers > 1:
            assert sid is not None, "Missing speaker id"
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = None
        if self.use_sdp:
            logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
        else:
            logw = self.dp(x, x_mask, g=g)
        w = torch.exp(logw) * x_mask * length_scale
        # Durations are rounded up to whole frames.
        w_ceil = torch.ceil(w)
        # Clamp so every item has an output length of at least 1.
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(
            commons.sequence_mask(y_lengths, y_lengths.max()), 1
        ).type_as(x_mask)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)
        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)

    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
        """Re-render ``y`` with the target speaker's embedding.

        Encodes ``y`` with the source embedding, maps it through the flow,
        inverts the flow with the target embedding and decodes.

        Returns:
            ``(o_hat, y_mask, (z, z_p, z_hat))``.
        """
        assert self.n_speakers > 1, "n_speakers have to be larger than 1."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
        return o_hat, y_mask, (z, z_p, z_hat)

View File

@@ -0,0 +1,527 @@
import math
import typing
import torch
from torch import nn
from torch.nn import Conv1d
from torch.nn import functional as F
from torch.nn.utils import remove_weight_norm, weight_norm
from .commons import fused_add_tanh_sigmoid_multiply, get_padding, init_weights
from .transforms import piecewise_rational_quadratic_transform
class LayerNorm(nn.Module):
    """Layer normalisation over dimension 1 of the input, with a learned
    per-channel scale (``gamma``) and shift (``beta``)."""

    def __init__(self, channels: int, eps: float = 1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` across dimension 1 and return the same layout."""
        # F.layer_norm works on trailing dims, so move channels last and back.
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return normed.transpose(1, -1)
class ConvReluNorm(nn.Module):
    """Residual stack of ``n_layers`` conv -> LayerNorm -> ReLU -> dropout
    stages followed by a zero-initialised 1x1 projection.

    Because the projection starts at zero, the module initially returns its
    (masked) input unchanged.

    Raises:
        AssertionError: if ``n_layers`` is not greater than 1.
    """

    def __init__(
        self,
        in_channels: int,
        hidden_channels: int,
        out_channels: int,
        kernel_size: int,
        n_layers: int,
        p_dropout: float,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message now states the bound the condition actually enforces
        # (it previously said "larger than 0").
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        # First layer maps in_channels -> hidden_channels.
        self.conv_layers.append(
            nn.Conv1d(
                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
            )
        )
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
        # Remaining layers keep the hidden width.
        for _ in range(n_layers - 1):
            self.conv_layers.append(
                nn.Conv1d(
                    hidden_channels,
                    hidden_channels,
                    kernel_size,
                    padding=kernel_size // 2,
                )
            )
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero-init so the residual branch contributes nothing at the start.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Return ``(x + proj(stack(x))) * x_mask``."""
        x_org = x
        for conv, norm in zip(self.conv_layers, self.norm_layers):
            x = conv(x * x_mask)
            x = norm(x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution

    Each layer is a depthwise convolution (dilation ``kernel_size ** i``)
    followed by a 1x1 convolution, each with LayerNorm and GELU, added back
    to the running signal as a residual.
    """

    def __init__(
        self, channels: int, kernel_size: int, n_layers: int, p_dropout: float = 0.0
    ):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            rate = kernel_size**layer_idx
            same_pad = (kernel_size * rate - rate) // 2
            depthwise = nn.Conv1d(
                channels,
                channels,
                kernel_size,
                groups=channels,
                dilation=rate,
                padding=same_pad,
            )
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Apply the residual stack to ``x`` (plus ``g`` if given), masked."""
        if g is not None:
            x = x + g
        stages = zip(self.convs_sep, self.norms_1, self.convs_1x1, self.norms_2)
        for depthwise, norm_a, pointwise, norm_b in stages:
            branch = F.gelu(norm_a(depthwise(x * x_mask)))
            branch = F.gelu(norm_b(pointwise(branch)))
            x = x + self.drop(branch)
        return x * x_mask
class WN(torch.nn.Module):
    """Stack of weight-normalised dilated convolutions with gated activations
    and residual/skip 1x1 convolutions, optionally conditioned on ``g``."""

    def __init__(
        self,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        gin_channels: int = 0,
        p_dropout: float = 0,
    ):
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One projection yields the conditioning slices for all layers.
            self.cond_layer = torch.nn.utils.weight_norm(
                torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
                name="weight",
            )

        last_idx = n_layers - 1
        for layer_idx in range(n_layers):
            rate = dilation_rate**layer_idx
            same_pad = int((kernel_size * rate - rate) / 2)
            gated_conv = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=rate,
                padding=same_pad,
            )
            self.in_layers.append(
                torch.nn.utils.weight_norm(gated_conv, name="weight")
            )

            # last one is not necessary: the final layer emits only the skip half
            if layer_idx < last_idx:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels
            res_skip_conv = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            self.res_skip_layers.append(
                torch.nn.utils.weight_norm(res_skip_conv, name="weight")
            )

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of every layer's skip output."""
        skip_total = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])
        width = self.hidden_channels
        last_idx = self.n_layers - 1

        if g is not None:
            g = self.cond_layer(g)

        layer_pairs = zip(self.in_layers, self.res_skip_layers)
        for layer_idx, (in_layer, res_skip_layer) in enumerate(layer_pairs):
            x_in = in_layer(x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                start = layer_idx * 2 * width
                g_l = g[:, start : start + 2 * width, :]

            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = res_skip_layer(acts)
            if layer_idx < last_idx:
                # First half updates the residual stream, second half is skip.
                x = (x + res_skip_acts[:, :width, :]) * x_mask
                skip_total = skip_total + res_skip_acts[:, width:, :]
            else:
                skip_total = skip_total + res_skip_acts
        return skip_total * x_mask

    def remove_weight_norm(self):
        """Strip weight normalisation from every wrapped convolution."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for in_layer in self.in_layers:
            torch.nn.utils.remove_weight_norm(in_layer)
        for res_skip_layer in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(res_skip_layer)
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs, each pair
    wrapped in a skip connection with leaky-ReLU pre-activations."""

    def __init__(
        self,
        channels: int,
        kernel_size: int = 3,
        dilation: typing.Tuple[int] = (1, 3, 5),
    ):
        super(ResBlock1, self).__init__()
        self.LRELU_SLOPE = 0.1

        def same_conv(rate):
            # Weight-normalised conv whose padding comes from get_padding.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        # Exactly the first three dilation entries are used.
        self.convs1 = nn.ModuleList([same_conv(dilation[idx]) for idx in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([same_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; mask intermediates if given."""
        masked = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            branch = F.leaky_relu(x, self.LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            branch = dilated_conv(branch)
            branch = F.leaky_relu(branch, self.LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            branch = plain_conv(branch)
            x = branch + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from all six convolutions."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
class ResBlock2(torch.nn.Module):
    """Lighter residual block: two dilated convolutions, each with its own
    skip connection and a leaky-ReLU pre-activation."""

    def __init__(
        self, channels: int, kernel_size: int = 3, dilation: typing.Tuple[int] = (1, 3)
    ):
        super(ResBlock2, self).__init__()
        self.LRELU_SLOPE = 0.1

        def same_conv(rate):
            # Weight-normalised conv whose padding comes from get_padding.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        # Exactly the first two dilation entries are used.
        self.convs = nn.ModuleList([same_conv(dilation[idx]) for idx in range(2)])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions; mask intermediates if given."""
        masked = x_mask is not None
        for conv in self.convs:
            branch = F.leaky_relu(x, self.LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            x = conv(branch) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)
class Log(nn.Module):
    """Element-wise log flow; the reverse direction applies ``exp``."""

    def forward(
        self, x: torch.Tensor, x_mask: torch.Tensor, reverse: bool = False, **kwargs
    ):
        """Return ``(log(x) * mask, logdet)`` or, reversed, ``exp(x) * mask``."""
        if reverse:
            return torch.exp(x) * x_mask

        # Clamp at 1e-5 so the logarithm stays finite.
        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        # d log(x)/dx = 1/x, so the log-determinant is the sum of -log(x).
        return y, torch.sum(-y, [1, 2])
class Flip(nn.Module):
    """Reverses the order of dimension 1; a volume-preserving flow step."""

    def forward(self, x: torch.Tensor, *args, reverse: bool = False, **kwargs):
        """Return the flipped tensor, plus a zero log-determinant when not
        running in reverse."""
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        zero_logdet = torch.zeros(flipped.size(0)).type_as(flipped)
        return flipped, zero_logdet
class ElementwiseAffine(nn.Module):
    """Per-channel affine flow ``y = m + exp(logs) * x`` with learned ``m``
    and ``logs`` (both initialised to zero, i.e. the identity)."""

    def __init__(self, channels: int):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Return ``(y, logdet)`` or, reversed, the inverse transform of ``x``."""
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask

        y = (self.m + torch.exp(self.logs) * x) * x_mask
        # Each unmasked element contributes its channel's log-scale.
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer: the first half of the channels parameterises a
    shift (and, unless ``mean_only``, a log-scale) for the second half."""

    def __init__(
        self,
        channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        p_dropout: float = 0,
        gin_channels: int = 0,
        mean_only: bool = False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        # (2 - mean_only) is 1 when only the mean is predicted, otherwise 2.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        # Zero-init so the layer starts out as the identity transform.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(y, logdet)`` or, reversed, the inverse transform of ``x``."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        hidden = self.pre(x0) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.post(hidden) * x_mask

        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        joined = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return joined, logdet
class ConvFlow(nn.Module):
    """Coupling flow whose second half is transformed by a piecewise
    rational-quadratic spline parameterised from the first half."""

    def __init__(
        self,
        in_channels: int,
        filter_channels: int,
        kernel_size: int,
        n_layers: int,
        num_bins: int = 10,
        tail_bound: float = 5.0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
        # Per channel: num_bins widths + num_bins heights + (num_bins - 1)
        # derivatives.
        self.proj = nn.Conv1d(
            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
        )
        # Zero-init the projection so all spline parameters start at zero.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(y, logdet)`` or, reversed, just the transformed tensor."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        params = self.pre(x0)
        params = self.convs(params, x_mask, g=g)
        params = self.proj(params) * x_mask

        b, c, t = x0.shape
        # [b, cx?, t] -> [b, c, t, ?]
        params = params.reshape(b, c, -1, t).permute(0, 1, 3, 2)

        bins = self.num_bins
        scale = math.sqrt(self.filter_channels)
        widths = params[..., :bins] / scale
        heights = params[..., bins : 2 * bins] / scale
        derivatives = params[..., 2 * bins :]

        x1, logabsdet = piecewise_rational_quadratic_transform(
            x1,
            widths,
            heights,
            derivatives,
            inverse=reverse,
            tails="linear",
            tail_bound=self.tail_bound,
        )

        out = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if reverse:
            return out
        return out, logdet

View File

@@ -0,0 +1,2 @@
# Build the extension modules in place via setup.py.
# `all` is declared phony so a stray file named "all" cannot suppress the build.
.PHONY: all
all:
	python3 setup.py build_ext --inplace

View File

@@ -0,0 +1,20 @@
import numpy as np
import torch
from .monotonic_align.core import maximum_path_c
def maximum_path(neg_cent, mask):
    """Run the Cython alignment search and return the path as a tensor.

    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]

    The result has the same shape, device and dtype as ``neg_cent``.
    """

    def _extent(axis):
        # Extent along one axis, read from the first row/column of the
        # summed mask and handed to the C routine as int32.
        return mask.sum(axis)[:, 0].data.cpu().numpy().astype(np.int32)

    target_device, target_dtype = neg_cent.device, neg_cent.dtype
    # The C routine works on float32 values and an int32 output buffer.
    values = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(values.shape, dtype=np.int32)
    t_t_max = _extent(1)
    t_s_max = _extent(2)
    maximum_path_c(path, values, t_t_max, t_s_max)
    return torch.from_numpy(path).to(device=target_device, dtype=target_dtype)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,42 @@
cimport cython
from cython.parallel import prange
# Dynamic-programming search for one batch item.
#   path:  output buffer; visited cells are set to 1 (the function never clears
#          cells, so the caller is presumably expected to pass zeros -- confirm).
#   value: score matrix, overwritten IN PLACE with cumulative best scores.
#   t_y, t_x: number of rows / columns of `value` that are actually used.
#   max_neg_val: stand-in for "unreachable" predecessor cells.
# Bounds checking and negative-index wraparound are disabled, so every index
# below must be in range by construction.
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef float tmp
    # Backtracking starts from the last used column.
    cdef int index = t_x - 1

    # Forward pass: accumulate the best score reaching each cell (y, x).
    # The x range restricts the search to cells with x <= y that can still
    # reach column t_x - 1 by row t_y - 1.
    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            # v_cur: score when staying in column x (from row y-1);
            # not available on the diagonal.
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[y-1, x]
            # v_prev: score when advancing from column x-1 (from row y-1);
            # only the origin cell (0, 0) starts from 0.
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[y-1, x-1]
            value[y, x] += max(v_prev, v_cur)

    # Backward pass: walk from the last row to the first, marking one column
    # per row and stepping left when forced (index == y) or when the
    # left predecessor scored higher.
    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
            index = index - 1
# Batched entry point: runs maximum_path_each on every batch item.
# `prange(..., nogil=True)` iterates the batch without the GIL; whether the
# iterations actually run in parallel depends on how the extension is
# compiled (OpenMP flags are not visible here -- confirm in the build setup).
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
    # Batch size is taken from the leading dimension of `paths`.
    cdef int b = paths.shape[0]
    cdef int i
    for i in prange(b, nogil=True):
        maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])

View File

@@ -0,0 +1,13 @@
from distutils.core import setup
from pathlib import Path
import numpy
from Cython.Build import cythonize
# Directory containing this setup script; core.pyx is resolved relative to it
# so the build works regardless of the current working directory.
_DIR = Path(__file__).parent
# NOTE(review): `setup` is imported from distutils, which was deprecated by
# PEP 632 and removed from the standard library in Python 3.12.
setup(
    name="monotonic_align",
    # Compile core.pyx into an extension module.
    ext_modules=cythonize(str(_DIR / "core.pyx")),
    # NumPy's include directory is passed for the compile step.
    include_dirs=[numpy.get_include()],
)

View File

@@ -0,0 +1,212 @@
import numpy as np
import torch
from torch.nn import functional as F
# Default lower bounds used by the spline functions below as the defaults of
# their `min_bin_width`, `min_bin_height` and `min_derivative` parameters.
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Dispatch to the bounded or the unconstrained spline transform.

    With ``tails=None`` the plain ``rational_quadratic_spline`` is used;
    otherwise ``unconstrained_rational_quadratic_spline`` is called with the
    given ``tails`` and ``tail_bound``.

    Returns:
        ``(outputs, logabsdet)`` as produced by the selected spline function.
    """
    extra_kwargs = {}
    if tails is None:
        transform = rational_quadratic_spline
    else:
        transform = unconstrained_rational_quadratic_spline
        extra_kwargs["tails"] = tails
        extra_kwargs["tail_bound"] = tail_bound

    return transform(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
        **extra_kwargs
    )
def searchsorted(bin_locations, inputs, eps=1e-6):
    """Return, for each input, the index of the bin that contains it.

    NOTE: the last entry of ``bin_locations`` is increased by ``eps``
    IN PLACE (the caller's tensor is modified), so an input equal to the
    right-most edge falls into the final bin.
    """
    # Explicit last index rather than -1, as in the original code.
    last_edge = bin_locations.size(-1) - 1
    bin_locations[..., last_edge] += eps
    # Count the edges at or below each input; minus one gives the bin index.
    at_or_below = inputs[..., None] >= bin_locations
    return torch.sum(at_or_below, dim=-1) - 1
def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Spline transform on ``[-tail_bound, tail_bound]``, identity outside.

    Inputs inside the interval go through ``rational_quadratic_spline``;
    inputs outside are passed through unchanged with a log-determinant of 0.

    Raises:
        RuntimeError: if ``tails`` is anything other than ``"linear"``.
    """
    inside = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside = ~inside

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails != "linear":
        raise RuntimeError("{} tails are not implemented.".format(tails))

    # Pad one derivative on each side and pin both boundary values to a
    # constant chosen so that min_derivative + softplus(constant) == 1.
    unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
    constant = np.log(np.exp(1 - min_derivative) - 1)
    unnormalized_derivatives[..., 0] = constant
    # Explicit last index rather than -1, as in the original code.
    unnormalized_derivatives[..., unnormalized_derivatives.size(-1) - 1] = constant

    outputs[outside] = inputs[outside]
    logabsdet[outside] = 0

    inner_outputs, inner_logabsdet = rational_quadratic_spline(
        inputs=inputs[inside],
        unnormalized_widths=unnormalized_widths[inside, :],
        unnormalized_heights=unnormalized_heights[inside, :],
        unnormalized_derivatives=unnormalized_derivatives[inside, :],
        inverse=inverse,
        left=-tail_bound,
        right=tail_bound,
        bottom=-tail_bound,
        top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )
    outputs[inside] = inner_outputs
    logabsdet[inside] = inner_logabsdet

    return outputs, logabsdet
def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Evaluate a piecewise rational-quadratic spline, or its inverse.

    Bin widths and heights are obtained from softmax-normalised parameters
    (each bin at least ``min_bin_width`` / ``min_bin_height``), and knot
    derivatives from ``min_derivative + softplus(unnormalized_derivatives)``.
    The spline maps ``[left, right]`` onto ``[bottom, top]``.

    Args:
        inputs: values to transform (x for the forward pass, y when
            ``inverse=True``).
        unnormalized_widths / unnormalized_heights: per-element bin
            parameters; the last axis indexes bins.
        unnormalized_derivatives: per-element knot parameters; the last axis
            presumably has ``num_bins + 1`` entries, since both knot ``i`` and
            knot ``i + 1`` are gathered for bin ``i`` -- TODO confirm.
        inverse: evaluate the inverse transform instead of the forward one.

    Returns:
        ``(outputs, logabsdet)``. In the inverse branch the returned
        log-abs-determinant is negated.

    Note:
        The domain checks are commented out, so out-of-range inputs are not
        rejected here. The inverse branch uses ``assert`` to check that the
        discriminant is non-negative.
    """
    # if torch.min(inputs) < left or torch.max(inputs) > right:
    #     raise ValueError("Input to a transform is not within its domain")
    num_bins = unnormalized_widths.shape[-1]
    # if min_bin_width * num_bins > 1.0:
    #     raise ValueError("Minimal bin width too large for the number of bins")
    # if min_bin_height * num_bins > 1.0:
    #     raise ValueError("Minimal bin height too large for the number of bins")

    # Normalised widths that sum to 1, each at least min_bin_width.
    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    # Cumulative widths -> bin edges on the x axis, rescaled to [left, right].
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
    cumwidths = (right - left) * cumwidths + left
    # Pin the outer edges exactly to remove accumulated rounding error.
    cumwidths[..., 0] = left
    # cumwidths[..., -1] = right
    cumwidths[..., cumwidths.size(-1) - 1] = right
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    # Knot derivatives, bounded below by min_derivative.
    derivatives = min_derivative + F.softplus(unnormalized_derivatives)

    # Same construction for the y axis (heights / cumheights).
    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    # cumheights[..., -1] = top
    cumheights[..., cumheights.size(-1) - 1] = top
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    # Locate the bin of each input: search along y for the inverse, x otherwise.
    # NOTE: searchsorted adds eps to the last edge of its first argument in place.
    if inverse:
        bin_idx = searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = searchsorted(cumwidths, inputs)[..., None]

    # Gather the parameters of the selected bin for every input element.
    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]

    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
    # Average slope of each bin.
    delta = heights / widths
    input_delta = delta.gather(-1, bin_idx)[..., 0]

    # Derivatives at the left and right knot of the selected bin.
    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]

    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        # Solve a*root^2 + b*root + c = 0 for the position inside the bin.
        a = (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        ) + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        )
        c = -input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all(), discriminant

        # Root written as 2c / (-b - sqrt(disc)) rather than the textbook form.
        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - root).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        # Negated: log-determinant of the inverse transform.
        return outputs, -logabsdet

    # Forward pass: theta is the relative position of the input inside its bin.
    theta = (inputs - input_cumwidths) / input_bin_widths
    theta_one_minus_theta = theta * (1 - theta)

    numerator = input_heights * (
        input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
    )
    denominator = input_delta + (
        (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
        * theta_one_minus_theta
    )
    outputs = input_cumheights + numerator / denominator

    derivative_numerator = input_delta.pow(2) * (
        input_derivatives_plus_one * theta.pow(2)
        + 2 * input_delta * theta_one_minus_theta
        + input_derivatives * (1 - theta).pow(2)
    )
    logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

    return outputs, logabsdet

View File

@@ -0,0 +1,16 @@
import numpy as np
import torch
def to_gpu(x: torch.Tensor) -> torch.Tensor:
    """Make ``x`` contiguous and transfer it with ``.cuda(non_blocking=True)``."""
    contiguous = x.contiguous()
    return contiguous.cuda(non_blocking=True)
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so that its peak absolute value maps to
    ``max_wav_value``. The peak is floored at 0.01 so that (near-)silent audio
    is not amplified without bound. Values are then clipped to
    ``[-max_wav_value, max_wav_value]`` and cast to ``int16``.

    Args:
        audio: Float audio samples.
        max_wav_value: Target peak amplitude.

    Returns:
        ``int16`` array with the same shape as ``audio``. An empty input
        yields an empty ``int16`` array.
    """
    if np.size(audio) == 0:
        # np.max() raises ValueError on an empty array; nothing to normalize.
        return np.asarray(audio).astype("int16")

    peak = max(0.01, np.max(np.abs(audio)))
    audio_norm = audio * (max_wav_value / peak)
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    return audio_norm.astype("int16")

View File

@@ -0,0 +1,860 @@
"""
Module to read / write wav files using NumPy arrays
Functions
---------
`read`: Return the sample rate (in samples/sec) and data from a WAV file.
`write`: Write a NumPy array as a WAV file.
"""
import io
import struct
import sys
import warnings
from enum import IntEnum
import numpy
__all__ = ["WavFileWarning", "read", "write"]
class WavFileWarning(UserWarning):
    """Warning category used by this module for irregular but readable WAV files."""
class WAVE_FORMAT(IntEnum):
    """
    WAVE form wFormatTag IDs

    Complete list is in mmreg.h in Windows 10 SDK.  ALAC and OPUS are the
    newest additions, in v10.0.14393 2016-07

    Members are plain integers (``IntEnum``), so they compare equal to the raw
    format tag read from a file header. Only ``PCM`` and ``IEEE_FLOAT`` are
    listed in ``KNOWN_WAVE_FORMATS``; the remaining members exist so that an
    unsupported tag can be reported by name.
    """

    UNKNOWN = 0x0000
    PCM = 0x0001
    ADPCM = 0x0002
    IEEE_FLOAT = 0x0003
    VSELP = 0x0004
    IBM_CVSD = 0x0005
    ALAW = 0x0006
    MULAW = 0x0007
    DTS = 0x0008
    DRM = 0x0009
    WMAVOICE9 = 0x000A
    WMAVOICE10 = 0x000B
    OKI_ADPCM = 0x0010
    DVI_ADPCM = 0x0011
    # Same value as DVI_ADPCM, so the enum treats this name as an alias of it.
    IMA_ADPCM = 0x0011  # Duplicate
    MEDIASPACE_ADPCM = 0x0012
    SIERRA_ADPCM = 0x0013
    G723_ADPCM = 0x0014
    DIGISTD = 0x0015
    DIGIFIX = 0x0016
    DIALOGIC_OKI_ADPCM = 0x0017
    MEDIAVISION_ADPCM = 0x0018
    CU_CODEC = 0x0019
    HP_DYN_VOICE = 0x001A
    YAMAHA_ADPCM = 0x0020
    SONARC = 0x0021
    DSPGROUP_TRUESPEECH = 0x0022
    ECHOSC1 = 0x0023
    AUDIOFILE_AF36 = 0x0024
    APTX = 0x0025
    AUDIOFILE_AF10 = 0x0026
    PROSODY_1612 = 0x0027
    LRC = 0x0028
    DOLBY_AC2 = 0x0030
    GSM610 = 0x0031
    MSNAUDIO = 0x0032
    ANTEX_ADPCME = 0x0033
    CONTROL_RES_VQLPC = 0x0034
    DIGIREAL = 0x0035
    DIGIADPCM = 0x0036
    CONTROL_RES_CR10 = 0x0037
    NMS_VBXADPCM = 0x0038
    CS_IMAADPCM = 0x0039
    ECHOSC3 = 0x003A
    ROCKWELL_ADPCM = 0x003B
    ROCKWELL_DIGITALK = 0x003C
    XEBEC = 0x003D
    G721_ADPCM = 0x0040
    G728_CELP = 0x0041
    MSG723 = 0x0042
    INTEL_G723_1 = 0x0043
    INTEL_G729 = 0x0044
    SHARP_G726 = 0x0045
    MPEG = 0x0050
    RT24 = 0x0052
    PAC = 0x0053
    MPEGLAYER3 = 0x0055
    LUCENT_G723 = 0x0059
    CIRRUS = 0x0060
    ESPCM = 0x0061
    VOXWARE = 0x0062
    CANOPUS_ATRAC = 0x0063
    G726_ADPCM = 0x0064
    G722_ADPCM = 0x0065
    DSAT = 0x0066
    DSAT_DISPLAY = 0x0067
    VOXWARE_BYTE_ALIGNED = 0x0069
    VOXWARE_AC8 = 0x0070
    VOXWARE_AC10 = 0x0071
    VOXWARE_AC16 = 0x0072
    VOXWARE_AC20 = 0x0073
    VOXWARE_RT24 = 0x0074
    VOXWARE_RT29 = 0x0075
    VOXWARE_RT29HW = 0x0076
    VOXWARE_VR12 = 0x0077
    VOXWARE_VR18 = 0x0078
    VOXWARE_TQ40 = 0x0079
    VOXWARE_SC3 = 0x007A
    VOXWARE_SC3_1 = 0x007B
    SOFTSOUND = 0x0080
    VOXWARE_TQ60 = 0x0081
    MSRT24 = 0x0082
    G729A = 0x0083
    MVI_MVI2 = 0x0084
    DF_G726 = 0x0085
    DF_GSM610 = 0x0086
    ISIAUDIO = 0x0088
    ONLIVE = 0x0089
    MULTITUDE_FT_SX20 = 0x008A
    INFOCOM_ITS_G721_ADPCM = 0x008B
    CONVEDIA_G729 = 0x008C
    CONGRUENCY = 0x008D
    SBC24 = 0x0091
    DOLBY_AC3_SPDIF = 0x0092
    MEDIASONIC_G723 = 0x0093
    PROSODY_8KBPS = 0x0094
    ZYXEL_ADPCM = 0x0097
    PHILIPS_LPCBB = 0x0098
    PACKED = 0x0099
    MALDEN_PHONYTALK = 0x00A0
    RACAL_RECORDER_GSM = 0x00A1
    RACAL_RECORDER_G720_A = 0x00A2
    RACAL_RECORDER_G723_1 = 0x00A3
    RACAL_RECORDER_TETRA_ACELP = 0x00A4
    NEC_AAC = 0x00B0
    RAW_AAC1 = 0x00FF
    RHETOREX_ADPCM = 0x0100
    IRAT = 0x0101
    VIVO_G723 = 0x0111
    VIVO_SIREN = 0x0112
    PHILIPS_CELP = 0x0120
    PHILIPS_GRUNDIG = 0x0121
    DIGITAL_G723 = 0x0123
    SANYO_LD_ADPCM = 0x0125
    SIPROLAB_ACEPLNET = 0x0130
    SIPROLAB_ACELP4800 = 0x0131
    SIPROLAB_ACELP8V3 = 0x0132
    SIPROLAB_G729 = 0x0133
    SIPROLAB_G729A = 0x0134
    SIPROLAB_KELVIN = 0x0135
    VOICEAGE_AMR = 0x0136
    G726ADPCM = 0x0140
    DICTAPHONE_CELP68 = 0x0141
    DICTAPHONE_CELP54 = 0x0142
    QUALCOMM_PUREVOICE = 0x0150
    QUALCOMM_HALFRATE = 0x0151
    TUBGSM = 0x0155
    MSAUDIO1 = 0x0160
    WMAUDIO2 = 0x0161
    WMAUDIO3 = 0x0162
    WMAUDIO_LOSSLESS = 0x0163
    WMASPDIF = 0x0164
    UNISYS_NAP_ADPCM = 0x0170
    UNISYS_NAP_ULAW = 0x0171
    UNISYS_NAP_ALAW = 0x0172
    UNISYS_NAP_16K = 0x0173
    SYCOM_ACM_SYC008 = 0x0174
    SYCOM_ACM_SYC701_G726L = 0x0175
    SYCOM_ACM_SYC701_CELP54 = 0x0176
    SYCOM_ACM_SYC701_CELP68 = 0x0177
    KNOWLEDGE_ADVENTURE_ADPCM = 0x0178
    FRAUNHOFER_IIS_MPEG2_AAC = 0x0180
    DTS_DS = 0x0190
    CREATIVE_ADPCM = 0x0200
    CREATIVE_FASTSPEECH8 = 0x0202
    CREATIVE_FASTSPEECH10 = 0x0203
    UHER_ADPCM = 0x0210
    ULEAD_DV_AUDIO = 0x0215
    ULEAD_DV_AUDIO_1 = 0x0216
    QUARTERDECK = 0x0220
    ILINK_VC = 0x0230
    RAW_SPORT = 0x0240
    ESST_AC3 = 0x0241
    GENERIC_PASSTHRU = 0x0249
    IPI_HSX = 0x0250
    IPI_RPELP = 0x0251
    CS2 = 0x0260
    SONY_SCX = 0x0270
    SONY_SCY = 0x0271
    SONY_ATRAC3 = 0x0272
    SONY_SPC = 0x0273
    TELUM_AUDIO = 0x0280
    TELUM_IA_AUDIO = 0x0281
    NORCOM_VOICE_SYSTEMS_ADPCM = 0x0285
    FM_TOWNS_SND = 0x0300
    MICRONAS = 0x0350
    MICRONAS_CELP833 = 0x0351
    BTV_DIGITAL = 0x0400
    INTEL_MUSIC_CODER = 0x0401
    INDEO_AUDIO = 0x0402
    QDESIGN_MUSIC = 0x0450
    ON2_VP7_AUDIO = 0x0500
    ON2_VP6_AUDIO = 0x0501
    VME_VMPCM = 0x0680
    TPC = 0x0681
    LIGHTWAVE_LOSSLESS = 0x08AE
    OLIGSM = 0x1000
    OLIADPCM = 0x1001
    OLICELP = 0x1002
    OLISBC = 0x1003
    OLIOPR = 0x1004
    LH_CODEC = 0x1100
    LH_CODEC_CELP = 0x1101
    LH_CODEC_SBC8 = 0x1102
    LH_CODEC_SBC12 = 0x1103
    LH_CODEC_SBC16 = 0x1104
    NORRIS = 0x1400
    ISIAUDIO_2 = 0x1401
    SOUNDSPACE_MUSICOMPRESS = 0x1500
    MPEG_ADTS_AAC = 0x1600
    MPEG_RAW_AAC = 0x1601
    MPEG_LOAS = 0x1602
    NOKIA_MPEG_ADTS_AAC = 0x1608
    NOKIA_MPEG_RAW_AAC = 0x1609
    VODAFONE_MPEG_ADTS_AAC = 0x160A
    VODAFONE_MPEG_RAW_AAC = 0x160B
    MPEG_HEAAC = 0x1610
    VOXWARE_RT24_SPEECH = 0x181C
    SONICFOUNDRY_LOSSLESS = 0x1971
    INNINGS_TELECOM_ADPCM = 0x1979
    LUCENT_SX8300P = 0x1C07
    LUCENT_SX5363S = 0x1C0C
    CUSEEME = 0x1F03
    NTCSOFT_ALF2CM_ACM = 0x1FC4
    DVM = 0x2000
    DTS2 = 0x2001
    MAKEAVIS = 0x3313
    DIVIO_MPEG4_AAC = 0x4143
    NOKIA_ADAPTIVE_MULTIRATE = 0x4201
    DIVIO_G726 = 0x4243
    LEAD_SPEECH = 0x434C
    LEAD_VORBIS = 0x564C
    WAVPACK_AUDIO = 0x5756
    OGG_VORBIS_MODE_1 = 0x674F
    OGG_VORBIS_MODE_2 = 0x6750
    OGG_VORBIS_MODE_3 = 0x6751
    OGG_VORBIS_MODE_1_PLUS = 0x676F
    OGG_VORBIS_MODE_2_PLUS = 0x6770
    OGG_VORBIS_MODE_3_PLUS = 0x6771
    ALAC = 0x6C61
    # Leading underscore because a Python identifier cannot start with a digit.
    _3COM_NBX = 0x7000  # Can't have leading digit
    OPUS = 0x704F
    FAAD_AAC = 0x706D
    AMR_NB = 0x7361
    AMR_WB = 0x7362
    AMR_WP = 0x7363
    GSM_AMR_CBR = 0x7A21
    GSM_AMR_VBR_SID = 0x7A22
    COMVERSE_INFOSYS_G723_1 = 0xA100
    COMVERSE_INFOSYS_AVQSBC = 0xA101
    COMVERSE_INFOSYS_SBC = 0xA102
    SYMBOL_G729_A = 0xA103
    VOICEAGE_AMR_WB = 0xA104
    INGENIENT_G726 = 0xA105
    MPEG4_AAC = 0xA106
    ENCORE_G726 = 0xA107
    ZOLL_ASAO = 0xA108
    SPEEX_VOICE = 0xA109
    VIANIX_MASC = 0xA10A
    WM9_SPECTRUM_ANALYZER = 0xA10B
    # NOTE(review): "ANAYZER" looks like a misspelling; renaming the member
    # would change the public name, so it is left as-is.
    WMF_SPECTRUM_ANAYZER = 0xA10C
    GSM_610 = 0xA10D
    GSM_620 = 0xA10E
    GSM_660 = 0xA10F
    GSM_690 = 0xA110
    GSM_ADAPTIVE_MULTIRATE_WB = 0xA111
    POLYCOM_G722 = 0xA112
    POLYCOM_G728 = 0xA113
    POLYCOM_G729_A = 0xA114
    POLYCOM_SIREN = 0xA115
    GLOBAL_IP_ILBC = 0xA116
    RADIOTIME_TIME_SHIFT_RADIO = 0xA117
    NICE_ACA = 0xA118
    NICE_ADPCM = 0xA119
    VOCORD_G721 = 0xA11A
    VOCORD_G726 = 0xA11B
    VOCORD_G722_1 = 0xA11C
    VOCORD_G728 = 0xA11D
    VOCORD_G729 = 0xA11E
    VOCORD_G729_A = 0xA11F
    VOCORD_G723_1 = 0xA120
    VOCORD_LBC = 0xA121
    NICE_G728 = 0xA122
    FRACE_TELECOM_G729 = 0xA123
    CODIAN = 0xA124
    FLAC = 0xF1AC
    EXTENSIBLE = 0xFFFE
    DEVELOPMENT = 0xFFFF
KNOWN_WAVE_FORMATS = {WAVE_FORMAT.PCM, WAVE_FORMAT.IEEE_FLOAT}
def _raise_bad_format(format_tag):
    """Raise ``ValueError`` naming an unsupported wave format tag.

    The tag is reported by its ``WAVE_FORMAT`` member name when it is a known
    ID, otherwise as a zero-padded hexadecimal number. The message also lists
    the names in ``KNOWN_WAVE_FORMATS``.
    """
    try:
        format_name = WAVE_FORMAT(format_tag).name
    except ValueError:
        # Not a registered wFormatTag: fall back to the raw hex value.
        format_name = f"{format_tag:#06x}"

    supported = ", ".join(known.name for known in KNOWN_WAVE_FORMATS)
    raise ValueError(
        f"Unknown wave file format: {format_name}. Supported "
        "formats: " + supported
    )
def _read_fmt_chunk(fid, is_big_endian):
    """
    Parse the 'fmt ' subchunk of a WAV file.

    Returns
    -------
    size : int
        size of format subchunk in bytes (minus 8 for "fmt " and itself)
    format_tag : int
        PCM, float, or compressed format
    channels : int
        number of channels
    fs : int
        sampling frequency in samples per second
    bytes_per_second : int
        overall byte rate for the file
    block_align : int
        bytes per sample, including all channels
    bit_depth : int
        bits per sample

    Raises
    ------
    ValueError
        If the chunk is shorter than 16 bytes, if an extensible header
        declares an extension smaller than 22 bytes, or if the resulting
        format tag is not in ``KNOWN_WAVE_FORMATS``.

    Notes
    -----
    Assumes file pointer is immediately after the 'fmt ' id
    """
    endian = ">" if is_big_endian else "<"

    (size,) = struct.unpack(endian + "I", fid.read(4))
    if size < 16:
        raise ValueError("Binary structure of wave file is not compliant")

    (
        format_tag,
        channels,
        fs,
        bytes_per_second,
        block_align,
        bit_depth,
    ) = struct.unpack(endian + "HHIIHH", fid.read(16))
    bytes_read = 16

    if format_tag == WAVE_FORMAT.EXTENSIBLE and size >= (16 + 2):
        (ext_chunk_size,) = struct.unpack(endian + "H", fid.read(2))
        bytes_read += 2
        if ext_chunk_size < 22:
            raise ValueError("Binary structure of wave file is not compliant")

        extensible_chunk_data = fid.read(22)
        bytes_read += 22
        # The 16-byte sub-format GUID follows 2 + 4 bytes of other fields.
        raw_guid = extensible_chunk_data[2 + 4 : 2 + 4 + 16]
        # GUID template {XXXXXXXX-0000-0010-8000-00AA00389B71} (RFC-2361)
        # MS GUID byte order: first three groups are native byte order,
        # rest is Big Endian
        if is_big_endian:
            tail = b"\x00\x00\x00\x10\x80\x00\x00\xAA\x00\x38\x9B\x71"
        else:
            tail = b"\x00\x00\x10\x00\x80\x00\x00\xAA\x00\x38\x9B\x71"
        if raw_guid.endswith(tail):
            # The first four GUID bytes carry the real format tag.
            (format_tag,) = struct.unpack(endian + "I", raw_guid[:4])

    if format_tag not in KNOWN_WAVE_FORMATS:
        _raise_bad_format(format_tag)

    # move file pointer to next chunk
    remaining = size - bytes_read
    if remaining > 0:
        fid.read(remaining)

    # fmt should always be 16, 18 or 40, but handle it just in case
    _handle_pad_byte(fid, size)

    return (size, format_tag, channels, fs, bytes_per_second, block_align, bit_depth)
def _read_data_chunk(
    fid, format_tag, channels, bit_depth, is_big_endian, block_align, mmap=False
):
    """
    Read the sample data of a 'data' subchunk into a NumPy array.

    Parameters
    ----------
    fid : file-like
        Open file positioned right after the 'data' id.
    format_tag : int
        Format tag from the fmt chunk (PCM or IEEE float are handled).
    channels : int
        Number of channels; data is reshaped to ``(-1, channels)`` when > 1.
    bit_depth : int
        Bits per sample from the fmt chunk.
    is_big_endian : bool
        Selects the byte order used for the size field and the sample dtype.
    block_align : int
        Bytes per sample frame, including all channels.
    mmap : bool, optional
        Use ``numpy.memmap`` instead of reading the data into memory.

    Raises
    ------
    ValueError
        For unsupported bit depths or formats, or when ``mmap=True`` is used
        with a container size other than 1, 2, 4 or 8 bytes.

    Notes
    -----
    Assumes file pointer is immediately after the 'data' id

    It's possible to not use all available bits in a container, or to store
    samples in a container bigger than necessary, so bytes_per_sample uses
    the actual reported container size (nBlockAlign / nChannels).  Real-world
    examples:

    Adobe Audition's "24-bit packed int (type 1, 20-bit)"

        nChannels = 2, nBlockAlign = 6, wBitsPerSample = 20

    http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples/AFsp/M1F1-int12-AFsp.wav
    is:

        nChannels = 2, nBlockAlign = 4, wBitsPerSample = 12

    http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Docs/multichaudP.pdf
    gives an example of:

        nChannels = 2, nBlockAlign = 8, wBitsPerSample = 20
    """
    if is_big_endian:
        fmt = ">"
    else:
        fmt = "<"

    # Size of the data subchunk in bytes
    size = struct.unpack(fmt + "I", fid.read(4))[0]

    # Number of bytes per sample (sample container size)
    bytes_per_sample = block_align // channels
    n_samples = size // bytes_per_sample

    if format_tag == WAVE_FORMAT.PCM:
        if 1 <= bit_depth <= 8:
            dtype = "u1"  # WAV of 8-bit integer or less are unsigned
        elif bytes_per_sample in {3, 5, 6, 7}:
            # No compatible dtype.  Load as raw bytes for reshaping later.
            dtype = "V1"
        elif bit_depth <= 64:
            # Remaining bit depths can map directly to signed numpy dtypes
            dtype = f"{fmt}i{bytes_per_sample}"
        else:
            raise ValueError(
                "Unsupported bit depth: the WAV file "
                f"has {bit_depth}-bit integer data."
            )
    elif format_tag == WAVE_FORMAT.IEEE_FLOAT:
        if bit_depth in {32, 64}:
            dtype = f"{fmt}f{bytes_per_sample}"
        else:
            raise ValueError(
                "Unsupported bit depth: the WAV file "
                f"has {bit_depth}-bit floating-point data."
            )
    else:
        _raise_bad_format(format_tag)

    start = fid.tell()
    if not mmap:
        try:
            # Raw-byte ("V1") reads count bytes; typed reads count samples.
            count = size if dtype == "V1" else n_samples
            data = numpy.fromfile(fid, dtype=dtype, count=count)
        except io.UnsupportedOperation:  # not a C-like file
            fid.seek(start, 0)  # just in case it seeked, though it shouldn't
            data = numpy.frombuffer(fid.read(size), dtype=dtype)

        if dtype == "V1":
            # Rearrange raw bytes into smallest compatible numpy dtype
            dt = f"{fmt}i4" if bytes_per_sample == 3 else f"{fmt}i8"
            # One zero-filled row per sample, as wide as the target integer.
            a = numpy.zeros(
                (len(data) // bytes_per_sample, numpy.dtype(dt).itemsize), dtype="V1"
            )
            # Copy the sample bytes to the most-significant end of each row
            # (first bytes for big-endian, last bytes for little-endian),
            # leaving the remaining low-order bytes zero.
            if is_big_endian:
                a[:, :bytes_per_sample] = data.reshape((-1, bytes_per_sample))
            else:
                a[:, -bytes_per_sample:] = data.reshape((-1, bytes_per_sample))
            data = a.view(dt).reshape(a.shape[:-1])
    else:
        if bytes_per_sample in {1, 2, 4, 8}:
            start = fid.tell()
            # mode="c" is copy-on-write: changes are not written to the file.
            data = numpy.memmap(
                fid, dtype=dtype, mode="c", offset=start, shape=(n_samples,)
            )
            # memmap does not advance the file position; skip the data by hand.
            fid.seek(start + size)
        else:
            raise ValueError(
                "mmap=True not compatible with "
                f"{bytes_per_sample}-byte container size."
            )

    _handle_pad_byte(fid, size)

    if channels > 1:
        data = data.reshape(-1, channels)
    return data
def _skip_unknown_chunk(fid, is_big_endian):
    """Skip over a chunk whose id has already been read.

    Reads the 4-byte chunk size, seeks past the chunk body, and skips the pad
    byte that follows odd-sized chunks. Does nothing if the size field cannot
    be read because the file has ended.
    """
    size_format = ">I" if is_big_endian else "<I"

    raw_size = fid.read(4)
    # Only unpack and seek when something was actually read; an empty read at
    # the end of the file would otherwise trigger a needless exception in
    # unpack().
    if not raw_size:
        return

    (size,) = struct.unpack(size_format, raw_size)
    fid.seek(size, 1)
    _handle_pad_byte(fid, size)
def _read_riff_chunk(fid):
    """Read the 12-byte RIFF/RIFX header at the current position of ``fid``.

    Returns
    -------
    file_size : int
        Size stored in the header plus 8 (the signature and size fields).
    is_big_endian : bool
        True for a 'RIFX' signature, False for 'RIFF'.

    Raises
    ------
    ValueError
        If the signature is neither 'RIFF' nor 'RIFX', or if the form type is
        not 'WAVE'.
    """
    signature = fid.read(4)  # File signature
    if signature not in (b"RIFF", b"RIFX"):
        # There are also .wav files with "FFIR" or "XFIR" signatures?
        raise ValueError(
            f"File format {repr(signature)} not understood. Only "
            "'RIFF' and 'RIFX' supported."
        )

    is_big_endian = signature == b"RIFX"
    size_format = ">I" if is_big_endian else "<I"

    # Size of entire file
    (riff_size,) = struct.unpack(size_format, fid.read(4))
    file_size = riff_size + 8

    form_type = fid.read(4)
    if form_type != b"WAVE":
        raise ValueError(f"Not a WAV file. RIFF form type is {repr(form_type)}.")

    return file_size, is_big_endian
def _handle_pad_byte(fid, size):
    """Advance ``fid`` by one byte when ``size`` is odd.

    "If the chunk size is an odd number of bytes, a pad byte with value zero
    is written after ckData."  So we need to seek past this after each chunk.
    """
    if size % 2 == 0:
        return
    fid.seek(1, 1)
def read(filename, mmap=False):
    """
    Open a WAV file.

    Return the sample rate (in samples/sec) and data from an LPCM WAV file.

    Parameters
    ----------
    filename : string or open file handle
        Input WAV file.
    mmap : bool, optional
        Whether to read data as memory-mapped (default: False).  Not compatible
        with some bit depths; see Notes.  Only to be used on real files.
        Ignored (forced to False) when an open file handle is passed.

        .. versionadded:: 0.12.0

    Returns
    -------
    rate : int
        Sample rate of WAV file.
    data : numpy array
        Data read from WAV file. Data-type is determined from the file;
        see Notes.  Data is 1-D for 1-channel WAV, or 2-D of shape
        (Nsamples, Nchannels) otherwise. If a file-like input without a
        C-like file descriptor (e.g., :class:`python:io.BytesIO`) is
        passed, this will not be writeable.

    Raises
    ------
    ValueError
        If the file ends before a data chunk was read, if a chunk id is
        truncated before both fmt and data chunks were read, or if a data
        chunk precedes the fmt chunk.

    Notes
    -----
    Common data types: [1]_

    =====================  ===========  ===========  =============
         WAV format            Min          Max       NumPy dtype
    =====================  ===========  ===========  =============
    32-bit floating-point  -1.0         +1.0         float32
    32-bit integer PCM     -2147483648  +2147483647  int32
    24-bit integer PCM     -2147483648  +2147483392  int32
    16-bit integer PCM     -32768       +32767       int16
    8-bit integer PCM      0            255          uint8
    =====================  ===========  ===========  =============

    WAV files can specify arbitrary bit depth, and this function supports
    reading any integer PCM depth from 1 to 64 bits.  Data is returned in the
    smallest compatible numpy int type, in left-justified format.  8-bit and
    lower is unsigned, while 9-bit and higher is signed.

    For example, 24-bit data will be stored as int32, with the MSB of the
    24-bit data stored at the MSB of the int32, and typically the least
    significant byte is 0x00.  (However, if a file actually contains data past
    its specified bit depth, those bits will be read and output, too. [2]_)

    This bit justification and sign matches WAV's native internal format, which
    allows memory mapping of WAV files that use 1, 2, 4, or 8 bytes per sample
    (so 24-bit files cannot be memory-mapped, but 32-bit can).

    IEEE float PCM in 32- or 64-bit format is supported, with or without mmap.
    Values exceeding [-1, +1] are not clipped.

    Non-linear PCM (mu-law, A-law) is not supported.

    When an open file handle is passed, it is not closed; it is rewound to
    position 0 before returning.

    References
    ----------
    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
       Interface and Data Specifications 1.0", section "Data Format of the
       Samples", August 1991
       http://www.tactilemedia.com/info/MCI_Control_Info.html
    .. [2] Adobe Systems Incorporated, "Adobe Audition 3 User Guide", section
       "Audio file formats: 24-bit Packed Int (type 1, 20-bit)", 2007

    Examples
    --------
    >>> from os.path import dirname, join as pjoin
    >>> from scipy.io import wavfile
    >>> import scipy.io

    Get the filename for an example .wav file from the tests/data directory.

    >>> data_dir = pjoin(dirname(scipy.io.__file__), 'tests', 'data')
    >>> wav_fname = pjoin(data_dir, 'test-44100Hz-2ch-32bit-float-be.wav')

    Load the .wav file contents.

    >>> samplerate, data = wavfile.read(wav_fname)
    >>> print(f"number of channels = {data.shape[1]}")
    number of channels = 2
    >>> length = data.shape[0] / samplerate
    >>> print(f"length = {length}s")
    length = 0.01s

    Plot the waveform.

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> time = np.linspace(0., length, data.shape[0])
    >>> plt.plot(time, data[:, 0], label="Left channel")
    >>> plt.plot(time, data[:, 1], label="Right channel")
    >>> plt.legend()
    >>> plt.xlabel("Time [s]")
    >>> plt.ylabel("Amplitude")
    >>> plt.show()
    """
    if hasattr(filename, "read"):
        fid = filename
        # Memory mapping is only attempted for files opened here.
        mmap = False
    else:
        # pylint: disable=consider-using-with
        fid = open(filename, "rb")

    try:
        file_size, is_big_endian = _read_riff_chunk(fid)
        fmt_chunk_received = False
        data_chunk_received = False
        while fid.tell() < file_size:
            # read the next chunk
            chunk_id = fid.read(4)

            if not chunk_id:
                if data_chunk_received:
                    # End of file but data successfully read
                    # (both literals are f-strings so that file_size is
                    # interpolated rather than printed as "{file_size}")
                    warnings.warn(
                        f"Reached EOF prematurely; finished at {fid.tell()} bytes, "
                        f"expected {file_size} bytes from header.",
                        WavFileWarning,
                        stacklevel=2,
                    )
                    break

                raise ValueError("Unexpected end of file.")

            if len(chunk_id) < 4:
                msg = f"Incomplete chunk ID: {repr(chunk_id)}"
                # If we have the data, ignore the broken chunk
                if fmt_chunk_received and data_chunk_received:
                    warnings.warn(msg + ", ignoring it.", WavFileWarning, stacklevel=2)
                else:
                    raise ValueError(msg)

            if chunk_id == b"fmt ":
                fmt_chunk_received = True
                fmt_chunk = _read_fmt_chunk(fid, is_big_endian)
                format_tag, channels, fs = fmt_chunk[1:4]
                bit_depth = fmt_chunk[6]
                block_align = fmt_chunk[5]
            elif chunk_id == b"fact":
                _skip_unknown_chunk(fid, is_big_endian)
            elif chunk_id == b"data":
                data_chunk_received = True
                if not fmt_chunk_received:
                    raise ValueError("No fmt chunk before data")
                data = _read_data_chunk(
                    fid,
                    format_tag,
                    channels,
                    bit_depth,
                    is_big_endian,
                    block_align,
                    mmap,
                )
            elif chunk_id == b"LIST":
                # Someday this could be handled properly but for now skip it
                _skip_unknown_chunk(fid, is_big_endian)
            elif chunk_id in {b"JUNK", b"Fake"}:
                # Skip alignment chunks without warning
                _skip_unknown_chunk(fid, is_big_endian)
            else:
                warnings.warn(
                    "Chunk (non-data) not understood, skipping it.",
                    WavFileWarning,
                    stacklevel=2,
                )
                _skip_unknown_chunk(fid, is_big_endian)
    finally:
        if not hasattr(filename, "read"):
            fid.close()
        else:
            fid.seek(0)

    return fs, data
def write(filename, rate, data):
    """
    Write a NumPy array as a WAV file.

    Parameters
    ----------
    filename : string or open file handle
        Output wav file.
    rate : int
        The sample rate (in samples/sec).
    data : ndarray
        A 1-D or 2-D NumPy array of either integer or float data-type.

    Raises
    ------
    ValueError
        If the dtype is not signed integer, float, or 1-byte unsigned integer,
        or if the data would exceed the 32-bit size field of a WAV file.

    Notes
    -----
    * Writes a simple uncompressed WAV file.
    * To write multiple-channels, use a 2-D array of shape
      (Nsamples, Nchannels).
    * The bits-per-sample and PCM/float will be determined by the data-type.
    * An open file handle is not closed; it is rewound to position 0.

    Common data types: [1]_

    =====================  ===========  ===========  =============
         WAV format            Min          Max       NumPy dtype
    =====================  ===========  ===========  =============
    32-bit floating-point  -1.0         +1.0         float32
    32-bit PCM             -2147483648  +2147483647  int32
    16-bit PCM             -32768       +32767       int16
    8-bit PCM              0            255          uint8
    =====================  ===========  ===========  =============

    Note that 8-bit PCM is unsigned.

    References
    ----------
    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
       Interface and Data Specifications 1.0", section "Data Format of the
       Samples", August 1991
       http://www.tactilemedia.com/info/MCI_Control_Info.html

    Examples
    --------
    Create a 100Hz sine wave, sampled at 44100Hz.
    Write to 16-bit PCM, Mono.

    >>> from scipy.io.wavfile import write
    >>> samplerate = 44100; fs = 100
    >>> t = np.linspace(0., 1., samplerate)
    >>> amplitude = np.iinfo(np.int16).max
    >>> data = amplitude * np.sin(2. * np.pi * fs * t)
    >>> write("example.wav", samplerate, data.astype(np.int16))
    """
    caller_owns_file = hasattr(filename, "write")
    if caller_owns_file:
        fid = filename
    else:
        # pylint: disable=consider-using-with
        fid = open(filename, "wb")

    fs = rate

    try:
        dkind = data.dtype.kind
        supported = dkind in ("i", "f") or (
            dkind == "u" and data.dtype.itemsize == 1
        )
        if not supported:
            raise ValueError(f"Unsupported data type '{data.dtype}'")

        # Integer kinds are written as PCM; everything else as IEEE float.
        is_pcm = dkind in ("i", "u")
        format_tag = WAVE_FORMAT.IEEE_FLOAT if dkind == "f" else WAVE_FORMAT.PCM

        channels = 1 if data.ndim == 1 else data.shape[1]
        bit_depth = data.dtype.itemsize * 8
        bytes_per_sample = bit_depth // 8
        bytes_per_second = fs * bytes_per_sample * channels
        block_align = channels * bytes_per_sample

        # fmt chunk
        fmt_chunk_data = struct.pack(
            "<HHIIHH",
            format_tag,
            channels,
            fs,
            bytes_per_second,
            block_align,
            bit_depth,
        )
        if not is_pcm:
            # add cbSize field for non-PCM files
            fmt_chunk_data += b"\x00\x00"

        header_parts = [
            b"RIFF",
            b"\x00\x00\x00\x00",  # placeholder, patched once the size is known
            b"WAVE",
            b"fmt ",
            struct.pack("<I", len(fmt_chunk_data)),
            fmt_chunk_data,
        ]

        # fact chunk (non-PCM files)
        if not is_pcm:
            header_parts.append(b"fact")
            header_parts.append(struct.pack("<II", 4, data.shape[0]))

        header_data = b"".join(header_parts)

        # check data size (needs to be immediately before the data chunk)
        riff_payload = (len(header_data) - 4 - 4) + (4 + 4 + data.nbytes)
        if riff_payload > 0xFFFFFFFF:
            raise ValueError("Data exceeds wave file size limit")

        fid.write(header_data)

        # data chunk
        fid.write(b"data")
        fid.write(struct.pack("<I", data.nbytes))
        # The file is little-endian: swap explicitly big-endian data, and
        # native-order data on a big-endian machine.
        byteorder = data.dtype.byteorder
        if byteorder == ">" or (byteorder == "=" and sys.byteorder == "big"):
            data = data.byteswap()
        _array_tofile(fid, data)

        # Determine file size and place it in correct
        # position at start of the file.
        size = fid.tell()
        fid.seek(4)
        fid.write(struct.pack("<I", size - 8))
    finally:
        if caller_owns_file:
            fid.seek(0)
        else:
            fid.close()
def _array_tofile(fid, data):
    """Write the raw bytes of ``data`` to ``fid`` in C (row-major) order."""
    # ravel gives a c-contiguous buffer
    flat = data.ravel()
    as_bytes = flat.view("b")
    fid.write(as_bytes.data)

View File

@@ -0,0 +1,111 @@
#!/usr/bin/env python3
import argparse
import logging
import time
from pathlib import Path
import librosa
import torch
from .vits.lightning import VitsModel
from .vits.mel_processing import spectrogram_torch
from .vits.wavfile import write as write_wav
_LOGGER = logging.getLogger("piper_train.voice_converstion")
def main():
    """Main entry point.

    Converts each input audio file from ``--from-speaker`` to ``--to-speaker``
    with the generator of a VITS checkpoint and writes one WAV file per input
    (named after the input's stem) into ``--output-dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("audio", nargs="+", help="Audio file(s) to convert")
    parser.add_argument("--checkpoint", required=True, help="Path to model checkpoint")
    parser.add_argument(
        "--output-dir",
        help="Directory to write WAV file(s) (default: current directory)",
    )
    parser.add_argument(
        "--from-speaker", required=True, type=int, help="Speaker id number of source"
    )
    parser.add_argument(
        "--to-speaker", required=True, type=int, help="Speaker id number of target"
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    args.checkpoint = Path(args.checkpoint)
    args.output_dir = Path(args.output_dir) if args.output_dir else Path.cwd()
    # Create the output directory itself (not just its parent); otherwise
    # writing the WAV files fails when --output-dir does not exist yet.
    args.output_dir.mkdir(parents=True, exist_ok=True)

    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
    model_g = model.model_g

    # Inference only
    model_g.eval()

    with torch.no_grad():
        model_g.dec.remove_weight_norm()

    # Audio is loaded, analysed and written at this fixed rate.
    sample_rate = 22050

    try:
        for audio_path_str in args.audio:
            audio_path = Path(audio_path_str)
            wav_path = args.output_dir / f"{audio_path.stem}.wav"
            audio, _sample_rate = librosa.load(path=audio_path_str, sr=sample_rate)

            with torch.no_grad():
                # NOTE: audio is already in [-1, 1] coming from librosa
                audio_norm = torch.FloatTensor(audio).unsqueeze(0)
                spec = spectrogram_torch(
                    y=audio_norm,
                    n_fft=1024,
                    sampling_rate=sample_rate,
                    hop_size=256,
                    win_size=1024,
                    center=False,
                ).squeeze(0)

                # Add the batch dimension back for the model call.
                specs = spec.unsqueeze(0)
                spec_lengths = torch.LongTensor([specs.shape[2]])
                from_speaker = torch.LongTensor([args.from_speaker])
                to_speaker = torch.LongTensor([args.to_speaker])

                start_time = time.perf_counter()
                audio = (
                    model_g.voice_conversion(
                        specs, spec_lengths, from_speaker, to_speaker
                    )[0][0, 0]
                    .data.cpu()
                    .float()
                    .numpy()
                )
                end_time = time.perf_counter()

            _LOGGER.debug(
                "Converted audio in %s second(s) (%s, shape=%s)",
                end_time - start_time,
                audio_path.stem,
                list(audio.shape),
            )

            write_wav(str(wav_path), sample_rate, audio)
            _LOGGER.info("Wrote WAV to %s", wav_path)
    except KeyboardInterrupt:
        # Allow Ctrl+C to stop a batch conversion quietly.
        pass
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,7 @@
cython>=0.29.0,<1
piper-phonemize~=1.1.0
librosa>=0.9.2,<1
numpy>=1.19.0
onnxruntime>=1.11.0
pytorch-lightning
# torch>=1.11.0,<2

View File

@@ -0,0 +1,7 @@
black==22.3.0
coverage==5.0.4
flake8==3.7.9
mypy==0.910
pylint==2.10.2
pytest==5.4.1
pytest-cov==2.8.1

View File

@@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Runs the given command inside the `larynx2-train` Docker image.
#
# The container is interactive, is given access to all GPUs, starts in the
# current working directory, and runs as the invoking user/group. The user's
# home directory is mounted read-write; /media/cache, /etc/hostname and
# /etc/localtime are mounted read-only. All script arguments are passed
# through as the container command.
#
# Follow instructions here: https://docs.docker.com/config/containers/resource_constraints/#access-an-nvidia-gpu
docker run \
    -it \
    --gpus all \
    -w "$PWD" \
    --user "$(id -u):$(id -g)" \
    --ipc=host \
    -v "${HOME}:${HOME}" \
    -v /media/cache:/media/cache:ro \
    -v /etc/hostname:/etc/hostname:ro \
    -v /etc/localtime:/etc/localtime:ro \
    larynx2-train \
    "$@"

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Runs formatters, linters, and type checkers on Python code.
#
# black and isort rewrite files in place; flake8, pylint and mypy only report.

# Abort on the first failing command, including failures inside pipelines.
set -eo pipefail

# Directory of *this* script
this_dir="$( cd "$( dirname "$0" )" && pwd )"

# Base directory of repo (parent of this script's directory)
base_dir="$(realpath "${this_dir}/..")"

# Path to virtual environment (override by exporting `venv`)
: "${venv:=${base_dir}/.venv}"

if [ -d "${venv}" ]; then
    # Activate virtual environment if available
    source "${venv}/bin/activate"
fi

# Paths handed to every tool below
python_files=("${base_dir}/piper_train")

# Format code
black "${python_files[@]}"
isort "${python_files[@]}"

# Check
flake8 "${python_files[@]}"
pylint "${python_files[@]}"
mypy "${python_files[@]}"

View File

@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Creates a fresh virtual environment and installs the Python dependencies
# listed in requirements.txt. Any existing environment at ${venv} is deleted.

# Abort on the first failing command, including failures inside pipelines.
set -eo pipefail

# Directory of *this* script
this_dir="$( cd "$( dirname "$0" )" && pwd )"

# Base directory of repo
base_dir="$(realpath "${this_dir}/..")"

# Path to virtual environment
: "${venv:=${base_dir}/.venv}"

# Python binary to use (":=" also covers a set-but-empty PYTHON, matching
# how `venv` is defaulted above)
: "${PYTHON:=python3}"

# Quoted like the venv invocation below, so a path with spaces works
python_version="$("${PYTHON}" --version)"

# Create virtual environment
echo "Creating virtual environment at ${venv} (${python_version})"
rm -rf "${venv}"
"${PYTHON}" -m venv "${venv}"
source "${venv}/bin/activate"

# Install Python dependencies
echo 'Installing Python dependencies'
pip3 install --upgrade pip
pip3 install --upgrade wheel setuptools

pip3 install -r "${base_dir}/requirements.txt"

# -----------------------------------------------------------------------------

echo "OK"

View File

@@ -0,0 +1,61 @@
#!/usr/bin/env python3
from collections import defaultdict
from pathlib import Path
import setuptools
from setuptools import setup
this_dir = Path(__file__).parent
module_dir = this_dir / "piper_train"

# -----------------------------------------------------------------------------

# Load README in as long description
long_description: str = ""
readme_path = this_dir / "README.md"
if readme_path.is_file():
    long_description = readme_path.read_text(encoding="utf-8")

# Install requirements come from requirements.txt. Blank lines and "#" comment
# lines are dropped so that only real requirement specifiers are passed on.
requirements = []
requirements_path = this_dir / "requirements.txt"
if requirements_path.is_file():
    with open(requirements_path, "r", encoding="utf-8") as requirements_file:
        requirements = [
            line.strip()
            for line in requirements_file.read().splitlines()
            if line.strip() and not line.strip().startswith("#")
        ]

# The package version is stored in piper_train/VERSION (also shipped as
# package data below).
version_path = module_dir / "VERSION"
with open(version_path, "r", encoding="utf-8") as version_file:
    version = version_file.read().strip()

# -----------------------------------------------------------------------------

setup(
    name="piper_train",
    version=version,
    description="A fast and local neural text to speech system",
    long_description=long_description,
    # README.md is Markdown; declare it so package indexes render it correctly.
    long_description_content_type="text/markdown",
    url="http://github.com/rhasspy/piper",
    author="Michael Hansen",
    author_email="mike@rhasspy.org",
    license="MIT",
    packages=setuptools.find_packages(),
    package_data={
        "piper_train": ["VERSION", "py.typed"],
    },
    install_requires=requirements,
    extras_require={':python_version<"3.9"': ["importlib_resources"]},
    entry_points={
        "console_scripts": [
            "piper-train = piper_train.__main__:main",
        ]
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Topic :: Text Processing :: Linguistic",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
    ],
    keywords="rhasspy tts speech voice",
)

View File

@@ -0,0 +1,3 @@
build/
dist/
*.egg-info/

View File

@@ -0,0 +1,6 @@
[settings]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88

View File

@@ -0,0 +1,2 @@
include requirements.txt
include piper/voices.json

View File

@@ -0,0 +1,27 @@
# Piper HTTP Server
Install the requirements into your virtual environment:
```sh
.venv/bin/pip3 install -r requirements_http.txt
```
Run the web server:
```sh
.venv/bin/python3 -m piper.http_server --model ...
```
See `--help` for more options.
Using a `GET` request:
```sh
curl -G --data-urlencode 'text=This is a test.' -o test.wav 'localhost:5000'
```
Using a `POST` request:
```sh
curl -X POST -H 'Content-Type: text/plain' --data 'This is a test.' -o test.wav 'localhost:5000'
```

View File

@@ -0,0 +1,7 @@
[mypy]
[mypy-onnxruntime.*]
ignore_missing_imports = True
[mypy-piper_phonemize.*]
ignore_missing_imports = True

View File

@@ -0,0 +1,5 @@
from .voice import PiperVoice
__all__ = [
"PiperVoice",
]

View File

@@ -0,0 +1,159 @@
import argparse
import logging
import sys
import time
import wave
from pathlib import Path
from typing import Any, Dict
from . import PiperVoice
from .download import ensure_voice_exists, find_voice, get_voices
_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)
def main() -> None:
    """Command-line entry point: read text from stdin and synthesize it.

    Parses the command line, makes sure the requested voice is available
    (looking it up in the voices catalogue and downloading it when ``--model``
    is not an existing path), loads the voice, and then writes audio in one of
    three modes:

    * ``--output-raw``: raw audio for each non-empty input line is streamed to
      stdout as it is produced;
    * ``--output-dir``: one WAV file per non-empty input line;
    * otherwise: a single WAV for the entire input, written to
      ``--output-file`` or to stdout when no file (or ``-``) is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    parser.add_argument(
        "-f",
        "--output-file",
        "--output_file",
        help="Path to output WAV file (default: stdout)",
    )
    parser.add_argument(
        "-d",
        "--output-dir",
        "--output_dir",
        help="Path to output directory (default: cwd)",
    )
    parser.add_argument(
        "--output-raw",
        "--output_raw",
        action="store_true",
        help="Stream raw audio to stdout",
    )
    #
    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    # The three synthesis scales share the same shape: dashed option,
    # underscore spelling of the same option, float value.
    for option, legacy_option, description in (
        ("--length-scale", "--length_scale", "Phoneme length"),
        ("--noise-scale", "--noise_scale", "Generator noise"),
        ("--noise-w", "--noise_w", "Phoneme width noise"),
    ):
        parser.add_argument(option, legacy_option, type=float, help=description)
    #
    parser.add_argument("--cuda", action="store_true", help="Use GPU")
    #
    parser.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    #
    parser.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    parser.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    #
    parser.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)
    _LOGGER.debug(args)

    # Download to first data directory by default
    args.download_dir = args.download_dir or args.data_dir[0]

    # A model argument that is not an existing path is treated as a voice name.
    if not Path(args.model).exists():
        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {
            alias: {"_is_alias": True, **info}
            for info in voices_info.values()
            for alias in info.get("aliases", [])
        }
        voices_info.update(aliases_info)

        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)

    # Load voice
    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = dict(
        speaker_id=args.speaker,
        length_scale=args.length_scale,
        noise_scale=args.noise_scale,
        noise_w=args.noise_w,
        sentence_silence=args.sentence_silence,
    )

    if args.output_raw:
        # Line-by-line: stream raw audio to stdout as it is produced
        raw_out = sys.stdout.buffer
        for raw_line in sys.stdin:
            sentence = raw_line.strip()
            if sentence:
                for audio_bytes in voice.synthesize_stream_raw(
                    sentence, **synthesize_args
                ):
                    raw_out.write(audio_bytes)
                    raw_out.flush()
        return

    if args.output_dir:
        # Line-by-line: one WAV file per non-empty input line
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for raw_line in sys.stdin:
            sentence = raw_line.strip()
            if not sentence:
                continue

            # File names come from the monotonic clock
            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
            with wave.open(str(wav_path), "wb") as wav_file:
                voice.synthesize(sentence, wav_file, **synthesize_args)

            _LOGGER.info("Wrote %s", wav_path)
        return

    # Read entire input and produce a single WAV
    text = sys.stdin.read()
    to_stdout = (not args.output_file) or (args.output_file == "-")
    wav_target = sys.stdout.buffer if to_stdout else args.output_file
    with wave.open(wav_target, "wb") as wav_file:
        voice.synthesize(text, wav_file, **synthesize_args)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,53 @@
"""Piper configuration"""
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, Mapping, Sequence
class PhonemeType(str, Enum):
    """Kind of phonemes a voice uses; members compare equal to their string values."""

    ESPEAK = "espeak"
    TEXT = "text"
@dataclass
class PiperConfig:
    """Piper configuration"""

    # Number of phonemes
    num_symbols: int

    # Number of speakers
    num_speakers: int

    # Sample rate of output audio
    sample_rate: int

    # Name of espeak-ng voice or alphabet
    espeak_voice: str

    # Default synthesis scales (read from the "inference" section by from_dict)
    length_scale: float
    noise_scale: float
    noise_w: float

    # Phoneme -> [id,]
    phoneme_id_map: Mapping[str, Sequence[int]]

    # espeak or text
    phoneme_type: PhonemeType

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        """Build a PiperConfig from a parsed voice configuration dictionary.

        Required keys: ``num_symbols``, ``num_speakers``,
        ``audio.sample_rate``, ``espeak.voice`` and ``phoneme_id_map``
        (a missing one raises ``KeyError``). The optional ``inference``
        section supplies ``noise_scale`` (default 0.667), ``length_scale``
        (default 1.0) and ``noise_w`` (default 0.8). ``phoneme_type``
        defaults to espeak.
        """
        inference = config.get("inference", {})

        # Entries are evaluated top to bottom, so lookups happen in the same
        # order as the keyword arguments they feed.
        settings: Dict[str, Any] = {
            "num_symbols": config["num_symbols"],
            "num_speakers": config["num_speakers"],
            "sample_rate": config["audio"]["sample_rate"],
            "noise_scale": inference.get("noise_scale", 0.667),
            "length_scale": inference.get("length_scale", 1.0),
            "noise_w": inference.get("noise_w", 0.8),
            #
            "espeak_voice": config["espeak"]["voice"],
            "phoneme_id_map": config["phoneme_id_map"],
            "phoneme_type": PhonemeType(
                config.get("phoneme_type", PhonemeType.ESPEAK)
            ),
        }
        return PiperConfig(**settings)

View File

@@ -0,0 +1,5 @@
"""Constants"""
PAD = "_"  # padding symbol (0); PiperVoice.phonemes_to_ids appends its ids after each phoneme
BOS = "^"  # beginning of sentence; its ids start every id sequence
EOS = "$"  # end of sentence; its ids finish every id sequence

View File

@@ -0,0 +1,139 @@
"""Utility for downloading Piper voices."""
import json
import logging
import shutil
from pathlib import Path
from typing import Any, Dict, Iterable, Set, Tuple, Union
from urllib.request import urlopen
from .file_hash import get_file_hash
URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"
_DIR = Path(__file__).parent
_LOGGER = logging.getLogger(__name__)
_SKIP_FILES = {"MODEL_CARD"}
class VoiceNotFoundError(Exception):
    """Raised when a requested voice name is not present in the voices catalogue."""
def get_voices(
    download_dir: Union[str, Path], update_voices: bool = False
) -> Dict[str, Any]:
    """Load the available voices from a downloaded or embedded JSON file.

    Args:
        download_dir: Directory holding (or receiving) a downloaded
            ``voices.json``.
        update_voices: When true, fetch the latest ``voices.json`` into
            ``download_dir`` before loading.

    Returns:
        The parsed contents of ``voices.json``. A copy in ``download_dir`` is
        preferred; otherwise the one shipped next to this module is used.
    """
    download_dir = Path(download_dir)
    voices_download = download_dir / "voices.json"

    if update_voices:
        # Download latest voices.json
        voices_url = URL_FORMAT.format(file="voices.json")
        _LOGGER.debug("Downloading %s to %s", voices_url, voices_download)

        # The download directory may not exist yet; create it first (as
        # ensure_voice_exists does) so opening the target file cannot fail
        # with FileNotFoundError.
        voices_download.parent.mkdir(parents=True, exist_ok=True)
        with urlopen(voices_url) as response, open(
            voices_download, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

    # Prefer downloaded file to embedded
    voices_embedded = _DIR / "voices.json"
    voices_path = voices_download if voices_download.exists() else voices_embedded

    _LOGGER.debug("Loading %s", voices_path)
    with open(voices_path, "r", encoding="utf-8") as voices_file:
        return json.load(voices_file)
def ensure_voice_exists(
    name: str,
    data_dirs: Iterable[Union[str, Path]],
    download_dir: Union[str, Path],
    voices_info: Dict[str, Any],
):
    """Check a voice's files in the data dirs and download those that fail.

    Every file listed for the voice (except names in ``_SKIP_FILES``) is
    checked in each data directory by existence, size and md5 digest. A file
    that fails a check in any directory is queued once and then fetched into
    ``download_dir``.

    Args:
        name: Voice name; must be a key of ``voices_info``.
        data_dirs: Directories to check for existing voice files.
        download_dir: Directory that receives downloaded files.
        voices_info: Voices catalogue; the entry's ``files`` maps a file path
            to a dict with ``size_bytes`` and ``md5_digest``.

    Raises:
        VoiceNotFoundError: ``name`` is not in ``voices_info``.
        ValueError: the voice lists no files at all.
    """
    assert data_dirs, "No data dirs"
    if name not in voices_info:
        raise VoiceNotFoundError(name)

    voice_files = voices_info[name]["files"]
    pending: Set[str] = set()

    for search_dir in map(Path, data_dirs):
        # Check sizes/hashes
        for remote_path, expected in voice_files.items():
            if remote_path in pending:
                # Already planning to download
                continue

            base_name = Path(remote_path).name
            if base_name in _SKIP_FILES:
                continue

            candidate = search_dir / base_name
            _LOGGER.debug("Checking %s", candidate)

            if not candidate.exists():
                _LOGGER.debug("Missing %s", candidate)
                pending.add(remote_path)
                continue

            wanted_size = expected["size_bytes"]
            found_size = candidate.stat().st_size
            if wanted_size != found_size:
                _LOGGER.warning(
                    "Wrong size (expected=%s, actual=%s) for %s",
                    wanted_size,
                    found_size,
                    candidate,
                )
                pending.add(remote_path)
                continue

            wanted_hash = expected["md5_digest"]
            found_hash = get_file_hash(candidate)
            if wanted_hash != found_hash:
                _LOGGER.warning(
                    "Wrong hash (expected=%s, actual=%s) for %s",
                    wanted_hash,
                    found_hash,
                    candidate,
                )
                pending.add(remote_path)

    nothing_listed = not voice_files
    if nothing_listed and (not pending):
        raise ValueError(f"Unable to find or download voice: {name}")

    # Download missing files
    target_dir = Path(download_dir)
    for remote_path in pending:
        base_name = Path(remote_path).name
        if base_name in _SKIP_FILES:
            continue

        source_url = URL_FORMAT.format(file=remote_path)
        target_path = target_dir / base_name
        target_path.parent.mkdir(parents=True, exist_ok=True)

        _LOGGER.debug("Downloading %s to %s", source_url, target_path)
        with urlopen(source_url) as response, open(target_path, "wb") as target_file:
            shutil.copyfileobj(response, target_file)

        _LOGGER.info("Downloaded %s (%s)", target_path, source_url)
def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
    """Locate the model and config files of a voice.

    Returns the ``<name>.onnx`` and ``<name>.onnx.json`` paths from the first
    data directory that contains both files.

    Raises:
        ValueError: no data directory contains both files.
    """
    for candidate_dir in map(Path, data_dirs):
        model_file = candidate_dir / f"{name}.onnx"
        config_file = candidate_dir / f"{name}.onnx.json"
        if not (model_file.exists() and config_file.exists()):
            continue

        return model_file, config_file

    raise ValueError(f"Missing files for voice {name}")

View File

@@ -0,0 +1,46 @@
import argparse
import hashlib
import json
import sys
from pathlib import Path
from typing import Union
def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
    """Return the hexadecimal md5 digest of a file, read in chunks.

    Args:
        path: File to hash.
        bytes_per_chunk: Size passed to each ``read`` call.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for block in iter(lambda: stream.read(bytes_per_chunk), b""):
            digest.update(block)

    return digest.hexdigest()
# -----------------------------------------------------------------------------
def main():
    """Print a JSON object mapping each given file to its md5 digest.

    With ``--dir``, the keys are the file paths relative to that directory;
    otherwise they are the paths as given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file", nargs="+")
    parser.add_argument("--dir", help="Parent directory")
    args = parser.parse_args()

    parent_dir = Path(args.dir) if args.dir else None

    digests = {}
    for file_arg in args.file:
        file_path = Path(file_arg)
        digest = get_file_hash(file_path)
        key_path = file_path if parent_dir is None else file_path.relative_to(parent_dir)
        digests[str(key_path)] = digest

    json.dump(digests, sys.stdout)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,127 @@
#!/usr/bin/env python3
import argparse
import io
import logging
import wave
from pathlib import Path
from typing import Any, Dict
from flask import Flask, request
from . import PiperVoice
from .download import ensure_voice_exists, find_voice, get_voices
_LOGGER = logging.getLogger()
def main() -> None:
    """Run a Flask HTTP server that synthesizes request text to WAV audio.

    Parses the command line, makes sure the requested voice is available
    (looking it up in the voices catalogue and downloading it when ``--model``
    is not an existing path), loads the voice once, and serves ``/``:

    * ``POST``: the request body (decoded as UTF-8) is the text;
    * ``GET``: the ``text`` query parameter is the text.

    A request without text is answered with HTTP 400. A successful request is
    answered with the WAV bytes and ``Content-Type: audio/wav``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="0.0.0.0", help="HTTP server host")
    parser.add_argument("--port", type=int, default=5000, help="HTTP server port")
    #
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    #
    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    parser.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    parser.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    parser.add_argument(
        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
    )
    #
    parser.add_argument("--cuda", action="store_true", help="Use GPU")
    #
    parser.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    #
    parser.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    parser.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    #
    parser.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    if not args.download_dir:
        # Download to first data directory by default
        args.download_dir = args.data_dir[0]

    # Download voice if file doesn't exist
    model_path = Path(args.model)
    if not model_path.exists():
        # Load voice info
        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for voice_info in voices_info.values():
            for voice_alias in voice_info.get("aliases", []):
                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}

        voices_info.update(aliases_info)
        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)

    # Load voice
    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = {
        "speaker_id": args.speaker,
        "length_scale": args.length_scale,
        "noise_scale": args.noise_scale,
        "noise_w": args.noise_w,
        "sentence_silence": args.sentence_silence,
    }

    # Create web server
    app = Flask(__name__)

    @app.route("/", methods=["GET", "POST"])
    def app_synthesize() -> Any:
        if request.method == "POST":
            text = request.data.decode("utf-8")
        else:
            text = request.args.get("text", "")

        text = text.strip()
        if not text:
            # Missing text is a client error: answer 400 rather than raising,
            # which would be reported to the client as a 500 server error.
            return "No text provided", 400

        _LOGGER.debug("Synthesizing text: %s", text)
        with io.BytesIO() as wav_io:
            with wave.open(wav_io, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)

            # Read the buffer only after the WAV writer has been closed so
            # the header is final.
            wav_bytes = wav_io.getvalue()

        # Label the body as WAV audio; a bare bytes return value would be
        # sent with Flask's default HTML content type.
        return wav_bytes, 200, {"Content-Type": "audio/wav"}

    app.run(host=args.host, port=args.port)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
"""Utilities"""
import numpy as np
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The samples are scaled so that the largest absolute value maps to
    ``max_wav_value``; the peak is floored at 0.01 so near-silent audio is
    not amplified without bound. The result is clipped to
    ``[-max_wav_value, max_wav_value]`` and cast to int16.

    Args:
        audio: Floating-point samples.
        max_wav_value: Target peak magnitude.

    Returns:
        An int16 array with the same shape as ``audio``. An empty input
        yields an empty int16 array.
    """
    if audio.size == 0:
        # np.max raises ValueError on an empty array; there is nothing to
        # normalize, so just hand back an empty int16 array.
        return audio.astype("int16")

    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm

View File

@@ -0,0 +1,185 @@
import json
import logging
import wave
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
import onnxruntime
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
from .config import PhonemeType, PiperConfig
from .const import BOS, EOS, PAD
from .util import audio_float_to_int16
_LOGGER = logging.getLogger(__name__)
@dataclass
class PiperVoice:
    """A loaded Piper voice: an ONNX inference session plus its configuration."""

    session: onnxruntime.InferenceSession
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Load an ONNX model and config.

        Args:
            model_path: Path to the ONNX model file.
            config_path: Path to the JSON config; defaults to
                ``<model_path>.json``.
            use_cuda: Run on the CUDA execution provider instead of the CPU.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        providers: List[Union[str, Tuple[str, Dict[str, Any]]]] = ["CPUExecutionProvider"]
        if use_cuda:
            cuda_options = {"cudnn_conv_algo_search": "HEURISTIC"}
            providers = [("CUDAExecutionProvider", cuda_options)]

        voice_config = PiperConfig.from_dict(config_dict)
        inference_session = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=onnxruntime.SessionOptions(),
            providers=providers,
        )
        return PiperVoice(config=voice_config, session=inference_session)

    def phonemize(self, text: str) -> List[List[str]]:
        """Text to phonemes grouped by sentence.

        Raises:
            ValueError: the configured phoneme type is neither espeak nor text.
        """
        phoneme_type = self.config.phoneme_type

        if phoneme_type == PhonemeType.ESPEAK:
            espeak_voice = self.config.espeak_voice
            if espeak_voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)

            return phonemize_espeak(text, espeak_voice)

        if phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Phonemes to ids.

        The result starts with the BOS ids and ends with the EOS ids; the ids
        of each known phoneme are followed by the PAD ids. Phonemes missing
        from the id map are skipped with a warning.
        """
        id_map = self.config.phoneme_id_map
        pad_ids = id_map[PAD]

        ids: List[int] = [*id_map[BOS]]
        for phoneme in phonemes:
            if phoneme in id_map:
                ids.extend(id_map[phoneme])
                ids.extend(pad_ids)
            else:
                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)

        ids.extend(id_map[EOS])
        return ids

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize WAV audio from text.

        Configures ``wav_file`` (configured sample rate, 16-bit, mono) and
        writes the audio of each sentence as frames.
        """
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        sentence_audio = self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        )
        for audio_bytes in sentence_audio:
            wav_file.writeframes(audio_bytes)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Synthesize raw audio per sentence from text.

        Yields one chunk of raw audio bytes per sentence, each followed by
        ``sentence_silence`` seconds of zero bytes.
        """
        sentence_phonemes = self.phonemize(text)

        # 16-bit mono: two zero bytes per silent sample
        bytes_per_sample = 2
        silence_sample_count = int(sentence_silence * self.config.sample_rate)
        silence_bytes = bytes(silence_sample_count * bytes_per_sample)

        for phonemes in sentence_phonemes:
            sentence_audio = self.synthesize_ids_to_raw(
                self.phonemes_to_ids(phonemes),
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            )
            yield sentence_audio + silence_bytes

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize raw audio from phoneme ids.

        Scales left as ``None`` fall back to the values in the voice config.
        The speaker id is dropped when the config has at most one speaker and
        defaults to 0 when it has several.
        """
        # Fill in unspecified scales from the configuration
        length_scale = self.config.length_scale if length_scale is None else length_scale
        noise_scale = self.config.noise_scale if noise_scale is None else noise_scale
        noise_w = self.config.noise_w if noise_w is None else noise_w

        # A batch of one sequence and its length
        ids_batch = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        ids_lengths = np.array([ids_batch.shape[1]], dtype=np.int64)
        scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)

        args = {"input": ids_batch, "input_lengths": ids_lengths, "scales": scales}

        if self.config.num_speakers <= 1:
            speaker_id = None
        elif speaker_id is None:
            # Default speaker
            speaker_id = 0

        if speaker_id is not None:
            args["sid"] = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx; drop the two leading singleton axes
        first_output = self.session.run(None, args)[0]
        audio = first_output.squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        return audio.tobytes()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,37 @@
[MESSAGES CONTROL]
disable=
format,
abstract-method,
cyclic-import,
duplicate-code,
global-statement,
import-outside-toplevel,
inconsistent-return-statements,
locally-disabled,
not-context-manager,
too-few-public-methods,
too-many-arguments,
too-many-branches,
too-many-instance-attributes,
too-many-lines,
too-many-locals,
too-many-public-methods,
too-many-return-statements,
too-many-statements,
too-many-boolean-expressions,
unnecessary-pass,
unused-argument,
broad-except,
too-many-nested-blocks,
invalid-name,
unused-import,
fixme,
useless-super-delegation,
missing-module-docstring,
missing-class-docstring,
missing-function-docstring,
import-error,
relative-beyond-top-level
[FORMAT]
expected-line-ending-format=LF

View File

@@ -0,0 +1,2 @@
piper-phonemize~=1.1.0
onnxruntime>=1.11.0,<2

View File

@@ -0,0 +1,5 @@
black==22.12.0
flake8==6.0.0
isort==5.11.3
mypy==0.991
pylint==2.15.9

View File

@@ -0,0 +1 @@
onnxruntime-gpu>=1.11.0,<2

View File

@@ -0,0 +1 @@
flask>=3,<4

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
import subprocess
import venv
from pathlib import Path
_DIR = Path(__file__).parent
_PROGRAM_DIR = _DIR.parent
_VENV_DIR = _PROGRAM_DIR / ".venv"
_MODULE_DIR = _PROGRAM_DIR / "piper"

# Only the interpreter path of the project's virtual environment is needed.
context = venv.EnvBuilder().ensure_directories(_VENV_DIR)

# Run each formatter over the module with the venv's Python; check_call
# stops at the first tool that exits with a non-zero status.
for _formatter in ("black", "isort"):
    subprocess.check_call([context.env_exe, "-m", _formatter, str(_MODULE_DIR)])

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python3
import subprocess
import venv
from pathlib import Path
_DIR = Path(__file__).parent
_PROGRAM_DIR = _DIR.parent
_VENV_DIR = _PROGRAM_DIR / ".venv"
_MODULE_DIR = _PROGRAM_DIR / "piper"

# Only the interpreter path of the project's virtual environment is needed.
context = venv.EnvBuilder().ensure_directories(_VENV_DIR)

# Checks run in this order with the venv's Python; check_call stops at the
# first tool that exits with a non-zero status.
_CHECKS = (
    ["black", str(_MODULE_DIR), "--check"],
    ["isort", str(_MODULE_DIR), "--check"],
    ["flake8", str(_MODULE_DIR)],
    ["pylint", str(_MODULE_DIR)],
    ["mypy", str(_MODULE_DIR)],
)
for _check in _CHECKS:
    subprocess.check_call([context.env_exe, "-m"] + _check)

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
import sys
import subprocess
import venv
from pathlib import Path
_DIR = Path(__file__).parent
_PROGRAM_DIR = _DIR.parent
_VENV_DIR = _PROGRAM_DIR / ".venv"

# Only the interpreter path of the project's virtual environment is needed.
context = venv.EnvBuilder().ensure_directories(_VENV_DIR)

# Run the piper module with the venv's Python, forwarding our own arguments.
subprocess.check_call([context.env_exe, "-m", "piper", *sys.argv[1:]])

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
import subprocess
import venv
from pathlib import Path
_DIR = Path(__file__).parent
_PROGRAM_DIR = _DIR.parent
_VENV_DIR = _PROGRAM_DIR / ".venv"

# Create virtual environment
builder = venv.EnvBuilder(with_pip=True)
context = builder.ensure_directories(_VENV_DIR)
builder.create(_VENV_DIR)

# pip is always invoked through the virtual environment's interpreter.
pip = [context.env_exe, "-m", "pip"]

# Upgrade the packaging tools first, then install the project requirements
# (with an extra find-links location for prebuilt packages).
_PIP_STEPS = (
    ["install", "--upgrade", "pip"],
    ["install", "--upgrade", "setuptools", "wheel"],
    [
        "install",
        "-f",
        "https://synesthesiam.github.io/prebuilt-apps/",
        "-r",
        str(_PROGRAM_DIR / "requirements.txt"),
    ],
)
for _pip_args in _PIP_STEPS:
    subprocess.check_call(pip + _pip_args)

View File

@@ -0,0 +1,22 @@
[flake8]
# To work with Black
max-line-length = 88
# E501: line too long
# W503: Line break occurred before a binary operator
# E203: Whitespace before ':'
# D202 No blank lines allowed after function docstring
# W504 line break after binary operator
ignore =
E501,
W503,
E203,
D202,
W504
[isort]
multi_line_output = 3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
indent = " "

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
from pathlib import Path
import setuptools
from setuptools import setup
this_dir = Path(__file__).parent
module_dir = this_dir / "piper"

# Install requirements: one entry per line of requirements.txt, when present.
requirements_path = this_dir / "requirements.txt"
requirements = (
    requirements_path.read_text(encoding="utf-8").splitlines()
    if requirements_path.is_file()
    else []
)

# Data files shipped inside the package, listed relative to the module below.
data_files = [module_dir / "voices.json"]
package_data_names = [str(data_file.relative_to(module_dir)) for data_file in data_files]

# -----------------------------------------------------------------------------

setup(
    name="piper-tts",
    version="1.2.0",
    description="A fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4.",
    url="http://github.com/rhasspy/piper",
    author="Michael Hansen",
    author_email="mike@rhasspy.org",
    license="MIT",
    packages=setuptools.find_packages(),
    package_data={"piper": package_data_names},
    entry_points={
        "console_scripts": [
            "piper = piper.__main__:main",
        ]
    },
    install_requires=requirements,
    extras_require={"gpu": ["onnxruntime-gpu>=1.11.0,<2"], "http": ["flask>=3,<4"]},
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Topic :: Text Processing :: Linguistic",
        "License :: OSI Approved :: MIT License",
        *[
            f"Programming Language :: Python :: {python_version}"
            for python_version in ("3.7", "3.8", "3.9", "3.10")
        ],
    ],
    keywords="rhasspy piper tts",
)