update README
This commit is contained in:
96
mlu_370-piper/piper/src/benchmark/benchmark_generator.py
Normal file
96
mlu_370-piper/piper/src/benchmark/benchmark_generator.py
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
import time
|
||||
import statistics
|
||||
import sys
|
||||
|
||||
import torch
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main() -> None:
    """Benchmark a pickled PyTorch generator and print a JSON report.

    Utterances are read from stdin, one JSON object per line. Each object
    must have a ``phoneme_ids`` list and may have a ``speaker_id``.

    The report written to stdout contains:

    * ``load_sec``: seconds spent in ``torch.load``
    * ``rtf_mean`` / ``rtf_stdev``: mean and sample standard deviation of the
      values returned by ``synthesize`` (``null`` when there are too few
      utterances to compute them)
    * ``synthesize_rtf``: the individual values, in input order
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to generator file (.pt)"
    )
    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    # Default config path is the model path with ".json" appended.
    if not args.config:
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Blank lines (e.g. a trailing newline) are not utterances; skip them
    # instead of letting json.loads fail on an empty string.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # Only the load itself is timed.
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code. Only benchmark model files from a trusted source.
    start_time = time.monotonic_ns()
    model = torch.load(args.model)
    end_time = time.monotonic_ns()

    model.eval()

    load_sec = (end_time - start_time) / 1e9
    synthesize_rtf = [
        synthesize(
            model,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    # statistics.mean needs at least one value and statistics.stdev at least
    # two; report None (JSON null) rather than raising StatisticsError.
    rtf_mean = statistics.mean(synthesize_rtf) if synthesize_rtf else None
    rtf_stdev = statistics.stdev(synthesize_rtf) if len(synthesize_rtf) > 1 else None

    json.dump(
        {
            "load_sec": load_sec,
            "rtf_mean": rtf_mean,
            "rtf_stdev": rtf_stdev,
            "synthesize_rtf": synthesize_rtf,
        },
        sys.stdout,
    )
|
||||
|
||||
|
||||
def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one utterance through the model and return its real-time factor.

    Args:
        model: Callable invoked as ``model(text, text_lengths, sid)``; the
            first element of its result is used as the audio tensor.
        phoneme_ids: Sequence of integer phoneme ids for the utterance.
        speaker_id: Integer speaker id, or ``None`` to pass no speaker.
        sample_rate: Audio sample rate used to convert samples to seconds.

    Returns:
        Inference seconds divided by seconds of audio produced.
    """
    # Add a leading batch dimension of 1.
    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
    text_lengths = torch.LongTensor([len(phoneme_ids)])
    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None

    start_time = time.monotonic_ns()
    # Inference only: disable autograd so the timed region does not include
    # the cost of recording a gradient graph.
    with torch.no_grad():
        audio = (
            model(
                text,
                text_lengths,
                sid,
            )[0]
            .detach()
            .numpy()
            .squeeze()
        )
    end_time = time.monotonic_ns()

    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9
    rtf = infer_sec / audio_sec

    _LOGGER.debug(
        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
        rtf,
        infer_sec,
        audio_sec,
    )

    return rtf
|
||||
|
||||
|
||||
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
125
mlu_370-piper/piper/src/benchmark/benchmark_onnx.py
Normal file
125
mlu_370-piper/piper/src/benchmark/benchmark_onnx.py
Normal file
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
import time
|
||||
import statistics
|
||||
import sys
|
||||
|
||||
import onnxruntime
|
||||
import numpy as np
|
||||
|
||||
_NOISE_SCALE = 0.667
|
||||
_LENGTH_SCALE = 1.0
|
||||
_NOISE_W = 0.8
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main() -> None:
    """Benchmark an ONNX model with onnxruntime and print a JSON report.

    Utterances are read from stdin, one JSON object per line. Each object
    must have a ``phoneme_ids`` list and may have a ``speaker_id``.

    The report written to stdout contains:

    * ``load_sec``: seconds spent configuring and creating the session
    * ``rtf_mean`` / ``rtf_stdev``: mean and sample standard deviation of the
      values returned by ``synthesize`` (``null`` when there are too few
      utterances to compute them)
    * ``rtfs``: the individual values, in input order
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to Onnx model file (.onnx)"
    )
    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    # Default config path is the model path with ".json" appended.
    if not args.config:
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Blank lines (e.g. a trailing newline) are not utterances; skip them
    # instead of letting json.loads fail on an empty string.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # The timed region covers session option setup and session creation.
    start_time = time.monotonic_ns()

    session_options = onnxruntime.SessionOptions()
    # Graph optimizations and memory reuse are switched off for this benchmark.
    session_options.graph_optimization_level = (
        onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
    )
    # session_options.enable_cpu_mem_arena = False
    # session_options.enable_mem_pattern = False
    session_options.enable_mem_reuse = False
    # session_options.enable_profiling = False
    # session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
    # session_options.execution_order = onnxruntime.ExecutionOrder.PRIORITY_BASED

    session = onnxruntime.InferenceSession(
        args.model,
        sess_options=session_options,
    )
    # session.intra_op_num_threads = 1
    # session.inter_op_num_threads = 1

    end_time = time.monotonic_ns()

    load_sec = (end_time - start_time) / 1e9
    synthesize_rtf = [
        synthesize(
            session,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    # statistics.mean needs at least one value and statistics.stdev at least
    # two; report None (JSON null) rather than raising StatisticsError.
    rtf_mean = statistics.mean(synthesize_rtf) if synthesize_rtf else None
    rtf_stdev = statistics.stdev(synthesize_rtf) if len(synthesize_rtf) > 1 else None

    json.dump(
        {
            "load_sec": load_sec,
            "rtf_mean": rtf_mean,
            "rtf_stdev": rtf_stdev,
            "rtfs": synthesize_rtf,
        },
        sys.stdout,
    )
|
||||
|
||||
|
||||
def synthesize(session, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one utterance through the ONNX session and return its real-time factor.

    Args:
        session: Object whose ``run(output_names, input_feed)`` method is
            called; the first returned output is used as the audio array.
        phoneme_ids: Sequence of integer phoneme ids for the utterance.
        speaker_id: Integer speaker id, or ``None`` when no speaker is given.
        sample_rate: Audio sample rate used to convert samples to seconds.

    Returns:
        Inference seconds divided by seconds of audio produced.
    """
    # Add a leading batch dimension of 1.
    phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
    scales = np.array(
        [_NOISE_SCALE, _LENGTH_SCALE, _NOISE_W],
        dtype=np.float32,
    )

    inputs = {
        "input": phoneme_ids_array,
        "input_lengths": phoneme_ids_lengths,
        "scales": scales,
    }

    # Feed "sid" only when a speaker id was supplied, rather than passing a
    # None value for it.
    # NOTE(review): whether a None feed is tolerated depends on the
    # onnxruntime version; omitting the key avoids relying on that.
    if speaker_id is not None:
        inputs["sid"] = np.array([speaker_id], dtype=np.int64)

    # Synthesize through Onnx
    start_time = time.monotonic_ns()
    audio = session.run(None, inputs)[0].squeeze()
    end_time = time.monotonic_ns()

    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9
    rtf = infer_sec / audio_sec

    _LOGGER.debug(
        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
        rtf,
        infer_sec,
        audio_sec,
    )

    return rtf
|
||||
|
||||
|
||||
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
103
mlu_370-piper/piper/src/benchmark/benchmark_torchscript.py
Normal file
103
mlu_370-piper/piper/src/benchmark/benchmark_torchscript.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
import time
|
||||
import statistics
|
||||
import sys
|
||||
|
||||
import torch
|
||||
|
||||
_NOISE_SCALE = 0.667
|
||||
_LENGTH_SCALE = 1.0
|
||||
_NOISE_W = 0.8
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main() -> None:
    """Benchmark a TorchScript model and print a JSON report.

    Utterances are read from stdin, one JSON object per line. Each object
    must have a ``phoneme_ids`` list and may have a ``speaker_id``.

    The report written to stdout contains:

    * ``load_sec``: seconds spent in ``torch.jit.load``
    * ``rtf_mean`` / ``rtf_stdev``: mean and sample standard deviation of the
      values returned by ``synthesize`` (``null`` when there are too few
      utterances to compute them)
    * ``synthesize_rtf``: the individual values, in input order
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", required=True, help="Path to Torchscript file (.ts)"
    )
    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG)

    # Default config path is the model path with ".json" appended.
    if not args.config:
        args.config = f"{args.model}.json"

    with open(args.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    sample_rate = config["audio"]["sample_rate"]

    # Blank lines (e.g. a trailing newline) are not utterances; skip them
    # instead of letting json.loads fail on an empty string.
    utterances = [json.loads(line) for line in sys.stdin if line.strip()]

    # Only the load itself is timed.
    start_time = time.monotonic_ns()
    model = torch.jit.load(args.model)
    end_time = time.monotonic_ns()

    model.eval()

    load_sec = (end_time - start_time) / 1e9
    synthesize_rtf = [
        synthesize(
            model,
            utterance["phoneme_ids"],
            utterance.get("speaker_id"),
            sample_rate,
        )
        for utterance in utterances
    ]

    # statistics.mean needs at least one value and statistics.stdev at least
    # two; report None (JSON null) rather than raising StatisticsError.
    rtf_mean = statistics.mean(synthesize_rtf) if synthesize_rtf else None
    rtf_stdev = statistics.stdev(synthesize_rtf) if len(synthesize_rtf) > 1 else None

    json.dump(
        {
            "load_sec": load_sec,
            "rtf_mean": rtf_mean,
            "rtf_stdev": rtf_stdev,
            "synthesize_rtf": synthesize_rtf,
        },
        sys.stdout,
    )
|
||||
|
||||
|
||||
def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
    """Run one utterance through the TorchScript model and return its real-time factor.

    Args:
        model: Callable invoked with the phoneme tensor, its length, the
            speaker tensor (or ``None``) and three one-element float tensors
            holding the noise scale, length scale and noise-w values; the
            first element of its result is used as the audio tensor.
        phoneme_ids: Sequence of integer phoneme ids for the utterance.
        speaker_id: Integer speaker id, or ``None`` to pass no speaker.
        sample_rate: Audio sample rate used to convert samples to seconds.

    Returns:
        Inference seconds divided by seconds of audio produced.
    """
    # Add a leading batch dimension of 1.
    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
    text_lengths = torch.LongTensor([len(phoneme_ids)])
    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None

    start_time = time.monotonic_ns()
    # Inference only: disable autograd so the timed region does not include
    # the cost of recording a gradient graph.
    with torch.no_grad():
        audio = (
            model(
                text,
                text_lengths,
                sid,
                torch.FloatTensor([_NOISE_SCALE]),
                torch.FloatTensor([_LENGTH_SCALE]),
                torch.FloatTensor([_NOISE_W]),
            )[0]
            .detach()
            .numpy()
            .squeeze()
        )
    end_time = time.monotonic_ns()

    audio_sec = len(audio) / sample_rate
    infer_sec = (end_time - start_time) / 1e9
    rtf = infer_sec / audio_sec

    _LOGGER.debug(
        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
        rtf,
        infer_sec,
        audio_sec,
    )

    return rtf
|
||||
|
||||
|
||||
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
2
mlu_370-piper/piper/src/benchmark/requirements.txt
Normal file
2
mlu_370-piper/piper/src/benchmark/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
onnxruntime~=1.11.0
|
||||
torch~=1.11.0
|
||||
24596
mlu_370-piper/piper/src/cpp/json.hpp
Normal file
24596
mlu_370-piper/piper/src/cpp/json.hpp
Normal file
File diff suppressed because it is too large
Load Diff
561
mlu_370-piper/piper/src/cpp/main.cpp
Normal file
561
mlu_370-piper/piper/src/cpp/main.cpp
Normal file
@@ -0,0 +1,561 @@
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <fcntl.h>
|
||||
#include <io.h>
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <mach-o/dyld.h>
|
||||
#endif
|
||||
|
||||
#include <spdlog/sinks/stdout_color_sinks.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "json.hpp"
|
||||
#include "piper.hpp"
|
||||
|
||||
using namespace std;
|
||||
using json = nlohmann::json;
|
||||
|
||||
// Destination for synthesized audio.
enum OutputType {
  OUTPUT_FILE,      // a single WAV file at a given path
  OUTPUT_DIRECTORY, // an automatically named WAV file inside a directory
  OUTPUT_STDOUT,    // WAV data written to stdout
  OUTPUT_RAW        // raw audio written to stdout as it becomes available
};
|
||||
|
||||
struct RunConfig {
|
||||
// Path to .onnx voice file
|
||||
filesystem::path modelPath;
|
||||
|
||||
// Path to JSON voice config file
|
||||
filesystem::path modelConfigPath;
|
||||
|
||||
// Type of output to produce.
|
||||
// Default is to write a WAV file in the current directory.
|
||||
OutputType outputType = OUTPUT_DIRECTORY;
|
||||
|
||||
// Path for output
|
||||
optional<filesystem::path> outputPath = filesystem::path(".");
|
||||
|
||||
// Numerical id of the default speaker (multi-speaker voices)
|
||||
optional<piper::SpeakerId> speakerId;
|
||||
|
||||
// Amount of noise to add during audio generation
|
||||
optional<float> noiseScale;
|
||||
|
||||
// Speed of speaking (1 = normal, < 1 is faster, > 1 is slower)
|
||||
optional<float> lengthScale;
|
||||
|
||||
// Variation in phoneme lengths
|
||||
optional<float> noiseW;
|
||||
|
||||
// Seconds of silence to add after each sentence
|
||||
optional<float> sentenceSilenceSeconds;
|
||||
|
||||
// Path to espeak-ng data directory (default is next to piper executable)
|
||||
optional<filesystem::path> eSpeakDataPath;
|
||||
|
||||
// Path to libtashkeel ort model
|
||||
// https://github.com/mush42/libtashkeel/
|
||||
optional<filesystem::path> tashkeelModelPath;
|
||||
|
||||
// stdin input is lines of JSON instead of text with format:
|
||||
// {
|
||||
// "text": str, (required)
|
||||
// "speaker_id": int, (optional)
|
||||
// "speaker": str, (optional)
|
||||
// "output_file": str, (optional)
|
||||
// }
|
||||
bool jsonInput = false;
|
||||
|
||||
// Seconds of extra silence to insert after a single phoneme
|
||||
optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
|
||||
|
||||
// true to use CUDA execution provider
|
||||
bool useCuda = false;
|
||||
};
|
||||
|
||||
void parseArgs(int argc, char *argv[], RunConfig &runConfig);
|
||||
void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
|
||||
condition_variable &cvAudio, bool &audioReady,
|
||||
bool &audioFinished);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
spdlog::set_default_logger(spdlog::stderr_color_st("piper"));
|
||||
|
||||
RunConfig runConfig;
|
||||
parseArgs(argc, argv, runConfig);
|
||||
|
||||
#ifdef _WIN32
|
||||
// Required on Windows to show IPA symbols
|
||||
SetConsoleOutputCP(CP_UTF8);
|
||||
#endif
|
||||
|
||||
piper::PiperConfig piperConfig;
|
||||
piper::Voice voice;
|
||||
|
||||
spdlog::debug("Loading voice from {} (config={})",
|
||||
runConfig.modelPath.string(),
|
||||
runConfig.modelConfigPath.string());
|
||||
|
||||
auto startTime = chrono::steady_clock::now();
|
||||
loadVoice(piperConfig, runConfig.modelPath.string(),
|
||||
runConfig.modelConfigPath.string(), voice, runConfig.speakerId,
|
||||
runConfig.useCuda);
|
||||
auto endTime = chrono::steady_clock::now();
|
||||
spdlog::info("Loaded voice in {} second(s)",
|
||||
chrono::duration<double>(endTime - startTime).count());
|
||||
|
||||
// Get the path to the piper executable so we can locate espeak-ng-data, etc.
|
||||
// next to it.
|
||||
#ifdef _MSC_VER
|
||||
auto exePath = []() {
|
||||
wchar_t moduleFileName[MAX_PATH] = {0};
|
||||
GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#else
|
||||
#ifdef __APPLE__
|
||||
auto exePath = []() {
|
||||
char moduleFileName[PATH_MAX] = {0};
|
||||
uint32_t moduleFileNameSize = std::size(moduleFileName);
|
||||
_NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
|
||||
return filesystem::path(moduleFileName);
|
||||
}();
|
||||
#else
|
||||
auto exePath = filesystem::canonical("/proc/self/exe");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
|
||||
spdlog::debug("Voice uses eSpeak phonemes ({})",
|
||||
voice.phonemizeConfig.eSpeak.voice);
|
||||
|
||||
if (runConfig.eSpeakDataPath) {
|
||||
// User provided path
|
||||
piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
|
||||
} else {
|
||||
// Assume next to piper executable
|
||||
piperConfig.eSpeakDataPath =
|
||||
std::filesystem::absolute(
|
||||
exePath.parent_path().append("espeak-ng-data"))
|
||||
.string();
|
||||
|
||||
spdlog::debug("espeak-ng-data directory is expected at {}",
|
||||
piperConfig.eSpeakDataPath);
|
||||
}
|
||||
} else {
|
||||
// Not using eSpeak
|
||||
piperConfig.useESpeak = false;
|
||||
}
|
||||
|
||||
// Enable libtashkeel for Arabic
|
||||
if (voice.phonemizeConfig.eSpeak.voice == "ar") {
|
||||
piperConfig.useTashkeel = true;
|
||||
if (runConfig.tashkeelModelPath) {
|
||||
// User provided path
|
||||
piperConfig.tashkeelModelPath =
|
||||
runConfig.tashkeelModelPath.value().string();
|
||||
} else {
|
||||
// Assume next to piper executable
|
||||
piperConfig.tashkeelModelPath =
|
||||
std::filesystem::absolute(
|
||||
exePath.parent_path().append("libtashkeel_model.ort"))
|
||||
.string();
|
||||
|
||||
spdlog::debug("libtashkeel model is expected at {}",
|
||||
piperConfig.tashkeelModelPath.value());
|
||||
}
|
||||
}
|
||||
|
||||
piper::initialize(piperConfig);
|
||||
|
||||
// Scales
|
||||
if (runConfig.noiseScale) {
|
||||
voice.synthesisConfig.noiseScale = runConfig.noiseScale.value();
|
||||
}
|
||||
|
||||
if (runConfig.lengthScale) {
|
||||
voice.synthesisConfig.lengthScale = runConfig.lengthScale.value();
|
||||
}
|
||||
|
||||
if (runConfig.noiseW) {
|
||||
voice.synthesisConfig.noiseW = runConfig.noiseW.value();
|
||||
}
|
||||
|
||||
if (runConfig.sentenceSilenceSeconds) {
|
||||
voice.synthesisConfig.sentenceSilenceSeconds =
|
||||
runConfig.sentenceSilenceSeconds.value();
|
||||
}
|
||||
|
||||
if (runConfig.phonemeSilenceSeconds) {
|
||||
if (!voice.synthesisConfig.phonemeSilenceSeconds) {
|
||||
// Overwrite
|
||||
voice.synthesisConfig.phonemeSilenceSeconds =
|
||||
runConfig.phonemeSilenceSeconds;
|
||||
} else {
|
||||
// Merge
|
||||
for (const auto &[phoneme, silenceSeconds] :
|
||||
*runConfig.phonemeSilenceSeconds) {
|
||||
voice.synthesisConfig.phonemeSilenceSeconds->try_emplace(
|
||||
phoneme, silenceSeconds);
|
||||
}
|
||||
}
|
||||
|
||||
} // if phonemeSilenceSeconds
|
||||
|
||||
if (runConfig.outputType == OUTPUT_DIRECTORY) {
|
||||
runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
|
||||
spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
|
||||
}
|
||||
|
||||
string line;
|
||||
piper::SynthesisResult result;
|
||||
while (getline(cin, line)) {
|
||||
auto outputType = runConfig.outputType;
|
||||
auto speakerId = voice.synthesisConfig.speakerId;
|
||||
std::optional<filesystem::path> maybeOutputPath = runConfig.outputPath;
|
||||
|
||||
if (runConfig.jsonInput) {
|
||||
// Each line is a JSON object
|
||||
json lineRoot = json::parse(line);
|
||||
|
||||
// Text is required
|
||||
line = lineRoot["text"].get<std::string>();
|
||||
|
||||
if (lineRoot.contains("output_file")) {
|
||||
// Override output WAV file path
|
||||
outputType = OUTPUT_FILE;
|
||||
maybeOutputPath =
|
||||
filesystem::path(lineRoot["output_file"].get<std::string>());
|
||||
}
|
||||
|
||||
if (lineRoot.contains("speaker_id")) {
|
||||
// Override speaker id
|
||||
voice.synthesisConfig.speakerId =
|
||||
lineRoot["speaker_id"].get<piper::SpeakerId>();
|
||||
} else if (lineRoot.contains("speaker")) {
|
||||
// Resolve to id using speaker id map
|
||||
auto speakerName = lineRoot["speaker"].get<std::string>();
|
||||
if ((voice.modelConfig.speakerIdMap) &&
|
||||
(voice.modelConfig.speakerIdMap->count(speakerName) > 0)) {
|
||||
voice.synthesisConfig.speakerId =
|
||||
(*voice.modelConfig.speakerIdMap)[speakerName];
|
||||
} else {
|
||||
spdlog::warn("No speaker named: {}", speakerName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Timestamp is used for path to output WAV file
|
||||
const auto now = chrono::system_clock::now();
|
||||
const auto timestamp =
|
||||
chrono::duration_cast<chrono::nanoseconds>(now.time_since_epoch())
|
||||
.count();
|
||||
|
||||
if (outputType == OUTPUT_DIRECTORY) {
|
||||
// Generate path using timestamp
|
||||
stringstream outputName;
|
||||
outputName << timestamp << ".wav";
|
||||
filesystem::path outputPath = runConfig.outputPath.value();
|
||||
outputPath.append(outputName.str());
|
||||
|
||||
// Output audio to automatically-named WAV file in a directory
|
||||
ofstream audioFile(outputPath.string(), ios::binary);
|
||||
piper::textToWavFile(piperConfig, voice, line, audioFile, result);
|
||||
cout << outputPath.string() << endl;
|
||||
} else if (outputType == OUTPUT_FILE) {
|
||||
if (!maybeOutputPath || maybeOutputPath->empty()) {
|
||||
throw runtime_error("No output path provided");
|
||||
}
|
||||
|
||||
filesystem::path outputPath = maybeOutputPath.value();
|
||||
|
||||
if (!runConfig.jsonInput) {
|
||||
// Read all of standard input before synthesizing.
|
||||
// Otherwise, we would overwrite the output file for each line.
|
||||
stringstream text;
|
||||
text << line;
|
||||
while (getline(cin, line)) {
|
||||
text << " " << line;
|
||||
}
|
||||
|
||||
line = text.str();
|
||||
}
|
||||
|
||||
// Output audio to WAV file
|
||||
ofstream audioFile(outputPath.string(), ios::binary);
|
||||
piper::textToWavFile(piperConfig, voice, line, audioFile, result);
|
||||
cout << outputPath.string() << endl;
|
||||
} else if (outputType == OUTPUT_STDOUT) {
|
||||
// Output WAV to stdout
|
||||
piper::textToWavFile(piperConfig, voice, line, cout, result);
|
||||
} else if (outputType == OUTPUT_RAW) {
|
||||
// Raw output to stdout
|
||||
mutex mutAudio;
|
||||
condition_variable cvAudio;
|
||||
bool audioReady = false;
|
||||
bool audioFinished = false;
|
||||
vector<int16_t> audioBuffer;
|
||||
vector<int16_t> sharedAudioBuffer;
|
||||
|
||||
#ifdef _WIN32
|
||||
// Needed on Windows to avoid terminal conversions
|
||||
setmode(fileno(stdout), O_BINARY);
|
||||
setmode(fileno(stdin), O_BINARY);
|
||||
#endif
|
||||
|
||||
thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer),
|
||||
ref(mutAudio), ref(cvAudio), ref(audioReady),
|
||||
ref(audioFinished));
|
||||
auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
|
||||
&cvAudio, &audioReady]() {
|
||||
// Signal thread that audio is ready
|
||||
{
|
||||
unique_lock lockAudio(mutAudio);
|
||||
copy(audioBuffer.begin(), audioBuffer.end(),
|
||||
back_inserter(sharedAudioBuffer));
|
||||
audioReady = true;
|
||||
cvAudio.notify_one();
|
||||
}
|
||||
};
|
||||
piper::textToAudio(piperConfig, voice, line, audioBuffer, result,
|
||||
audioCallback);
|
||||
|
||||
// Signal thread that there is no more audio
|
||||
{
|
||||
unique_lock lockAudio(mutAudio);
|
||||
audioReady = true;
|
||||
audioFinished = true;
|
||||
cvAudio.notify_one();
|
||||
}
|
||||
|
||||
// Wait for audio output to finish
|
||||
spdlog::info("Waiting for audio to finish playing...");
|
||||
rawOutputThread.join();
|
||||
}
|
||||
|
||||
spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)",
|
||||
result.realTimeFactor, result.inferSeconds,
|
||||
result.audioSeconds);
|
||||
|
||||
// Restore config (--json-input)
|
||||
voice.synthesisConfig.speakerId = speakerId;
|
||||
|
||||
} // for each line
|
||||
|
||||
piper::terminate(piperConfig);
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Writer loop for raw output: repeatedly waits until `audioReady` is set,
// takes all samples out of `sharedAudioBuffer` while holding `mutAudio`, and
// writes them to stdout as raw 16-bit samples outside the lock.
//
// Returns once `audioFinished` is set and the shared buffer is empty.
// `audioReady` is reset after each batch unless `audioFinished` is set, so
// the final iteration does not block waiting for another notification.
void rawOutputProc(std::vector<int16_t> &sharedAudioBuffer,
                   std::mutex &mutAudio, std::condition_variable &cvAudio,
                   bool &audioReady, bool &audioFinished) {
  std::vector<int16_t> internalAudioBuffer;
  while (true) {
    {
      std::unique_lock<std::mutex> lockAudio{mutAudio};
      cvAudio.wait(lockAudio, [&audioReady] { return audioReady; });

      if (sharedAudioBuffer.empty() && audioFinished) {
        break;
      }

      // internalAudioBuffer is always empty at this point (it is cleared
      // after every write), so swapping hands over the pending samples and
      // leaves the shared buffer empty without copying under the lock.
      internalAudioBuffer.swap(sharedAudioBuffer);

      if (!audioFinished) {
        audioReady = false;
      }
    }

    // Write outside the lock so the producer is not blocked on stdout.
    std::cout.write(
        reinterpret_cast<const char *>(internalAudioBuffer.data()),
        static_cast<std::streamsize>(sizeof(int16_t) *
                                     internalAudioBuffer.size()));
    std::cout.flush();
    internalAudioBuffer.clear();
  }

} // rawOutputProc
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Prints command-line help to stderr. argv[0] is used as the program name.
//
// Lists every option handled by parseArgs, including --phoneme_silence and
// --version.
void printUsage(char *argv[]) {
  std::cerr << std::endl;
  std::cerr << "usage: " << argv[0] << " [options]" << std::endl;
  std::cerr << std::endl;
  std::cerr << "options:" << std::endl;
  std::cerr << " -h --help show this message and exit" << std::endl;
  std::cerr << " -m FILE --model FILE path to onnx model file" << std::endl;
  std::cerr << " -c FILE --config FILE path to model config file "
               "(default: model path + .json)"
            << std::endl;
  std::cerr << " -f FILE --output_file FILE path to output WAV file ('-' for "
               "stdout)"
            << std::endl;
  std::cerr << " -d DIR --output_dir DIR path to output directory (default: "
               "cwd)"
            << std::endl;
  std::cerr << " --output_raw output raw audio to stdout as it "
               "becomes available"
            << std::endl;
  std::cerr << " -s NUM --speaker NUM id of speaker (default: 0)" << std::endl;
  std::cerr << " --noise_scale NUM generator noise (default: 0.667)"
            << std::endl;
  std::cerr << " --length_scale NUM phoneme length (default: 1.0)"
            << std::endl;
  std::cerr << " --noise_w NUM phoneme width noise (default: 0.8)"
            << std::endl;
  std::cerr << " --sentence_silence NUM seconds of silence after each "
               "sentence (default: 0.2)"
            << std::endl;
  std::cerr << " --phoneme_silence PHONEME NUM seconds of extra silence "
               "after a phoneme"
            << std::endl;
  std::cerr << " --espeak_data DIR path to espeak-ng data directory"
            << std::endl;
  std::cerr << " --tashkeel_model FILE path to libtashkeel onnx model "
               "(arabic)"
            << std::endl;
  std::cerr << " --json-input stdin input is lines of JSON "
               "instead of plain text"
            << std::endl;
  std::cerr << " --use-cuda use CUDA execution provider" << std::endl;
  std::cerr << " --version print version and exit" << std::endl;
  std::cerr << " --debug print DEBUG messages to the console" << std::endl;
  std::cerr << " -q --quiet disable logging" << std::endl;
  std::cerr << std::endl;
}
|
||||
|
||||
void ensureArg(int argc, char *argv[], int argi) {
|
||||
if ((argi + 1) >= argc) {
|
||||
printUsage(argv);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse command-line arguments
|
||||
void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
|
||||
optional<filesystem::path> modelConfigPath;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-m" || arg == "--model") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.modelPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "-c" || arg == "--config") {
|
||||
ensureArg(argc, argv, i);
|
||||
modelConfigPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "-f" || arg == "--output_file" ||
|
||||
arg == "--output-file") {
|
||||
ensureArg(argc, argv, i);
|
||||
std::string filePath = argv[++i];
|
||||
if (filePath == "-") {
|
||||
runConfig.outputType = OUTPUT_STDOUT;
|
||||
runConfig.outputPath = nullopt;
|
||||
} else {
|
||||
runConfig.outputType = OUTPUT_FILE;
|
||||
runConfig.outputPath = filesystem::path(filePath);
|
||||
}
|
||||
} else if (arg == "-d" || arg == "--output_dir" || arg == "output-dir") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.outputType = OUTPUT_DIRECTORY;
|
||||
runConfig.outputPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--output_raw" || arg == "--output-raw") {
|
||||
runConfig.outputType = OUTPUT_RAW;
|
||||
} else if (arg == "-s" || arg == "--speaker") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]);
|
||||
} else if (arg == "--noise_scale" || arg == "--noise-scale") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.noiseScale = stof(argv[++i]);
|
||||
} else if (arg == "--length_scale" || arg == "--length-scale") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.lengthScale = stof(argv[++i]);
|
||||
} else if (arg == "--noise_w" || arg == "--noise-w") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.noiseW = stof(argv[++i]);
|
||||
} else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.sentenceSilenceSeconds = stof(argv[++i]);
|
||||
} else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") {
|
||||
ensureArg(argc, argv, i);
|
||||
ensureArg(argc, argv, i + 1);
|
||||
auto phonemeStr = std::string(argv[++i]);
|
||||
if (!piper::isSingleCodepoint(phonemeStr)) {
|
||||
std::cerr << "Phoneme '" << phonemeStr
|
||||
<< "' is not a single codepoint (--phoneme_silence)"
|
||||
<< std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (!runConfig.phonemeSilenceSeconds) {
|
||||
runConfig.phonemeSilenceSeconds.emplace();
|
||||
}
|
||||
|
||||
auto phoneme = piper::getCodepoint(phonemeStr);
|
||||
(*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]);
|
||||
} else if (arg == "--espeak_data" || arg == "--espeak-data") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") {
|
||||
ensureArg(argc, argv, i);
|
||||
runConfig.tashkeelModelPath = filesystem::path(argv[++i]);
|
||||
} else if (arg == "--json_input" || arg == "--json-input") {
|
||||
runConfig.jsonInput = true;
|
||||
} else if (arg == "--use_cuda" || arg == "--use-cuda") {
|
||||
runConfig.useCuda = true;
|
||||
} else if (arg == "--version") {
|
||||
std::cout << piper::getVersion() << std::endl;
|
||||
exit(0);
|
||||
} else if (arg == "--debug") {
|
||||
// Set DEBUG logging
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
} else if (arg == "-q" || arg == "--quiet") {
|
||||
// diable logging
|
||||
spdlog::set_level(spdlog::level::off);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
printUsage(argv);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Verify model file exists
|
||||
ifstream modelFile(runConfig.modelPath.c_str(), ios::binary);
|
||||
if (!modelFile.good()) {
|
||||
throw runtime_error("Model file doesn't exist");
|
||||
}
|
||||
|
||||
if (!modelConfigPath) {
|
||||
runConfig.modelConfigPath =
|
||||
filesystem::path(runConfig.modelPath.string() + ".json");
|
||||
} else {
|
||||
runConfig.modelConfigPath = modelConfigPath.value();
|
||||
}
|
||||
|
||||
// Verify model config exists
|
||||
ifstream modelConfigFile(runConfig.modelConfigPath.c_str());
|
||||
if (!modelConfigFile.good()) {
|
||||
throw runtime_error("Model config doesn't exist");
|
||||
}
|
||||
}
|
||||
636
mlu_370-piper/piper/src/cpp/piper.cpp
Normal file
636
mlu_370-piper/piper/src/cpp/piper.cpp
Normal file
@@ -0,0 +1,636 @@
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <espeak-ng/speak_lib.h>
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "json.hpp"
|
||||
#include "piper.hpp"
|
||||
#include "utf8.h"
|
||||
#include "wavfile.hpp"
|
||||
|
||||
namespace piper {
|
||||
|
||||
#ifdef _PIPER_VERSION
|
||||
// https://stackoverflow.com/questions/47346133/how-to-use-a-define-inside-a-format-string
|
||||
#define _STR(x) #x
|
||||
#define STR(x) _STR(x)
|
||||
const std::string VERSION = STR(_PIPER_VERSION);
|
||||
#else
|
||||
const std::string VERSION = "";
|
||||
#endif
|
||||
|
||||
// Maximum value for 16-bit signed WAV sample
|
||||
const float MAX_WAV_VALUE = 32767.0f;
|
||||
|
||||
const std::string instanceName{"piper"};
|
||||
|
||||
// Returns the VERSION string defined in this file (empty when
// _PIPER_VERSION is not defined at build time).
std::string getVersion() {
  return VERSION;
}
|
||||
|
||||
// True if the string is a single UTF-8 codepoint
|
||||
bool isSingleCodepoint(std::string s) {
|
||||
return utf8::distance(s.begin(), s.end()) == 1;
|
||||
}
|
||||
|
||||
// Get the first UTF-8 codepoint of a string
|
||||
Phoneme getCodepoint(std::string s) {
|
||||
utf8::iterator character_iter(s.begin(), s.begin(), s.end());
|
||||
return *character_iter;
|
||||
}
|
||||
|
||||
// Load JSON config information for phonemization
|
||||
void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
|
||||
// {
|
||||
// "espeak": {
|
||||
// "voice": "<language code>"
|
||||
// },
|
||||
// "phoneme_type": "<espeak or text>",
|
||||
// "phoneme_map": {
|
||||
// "<from phoneme>": ["<to phoneme 1>", "<to phoneme 2>", ...]
|
||||
// },
|
||||
// "phoneme_id_map": {
|
||||
// "<phoneme>": [<id1>, <id2>, ...]
|
||||
// }
|
||||
// }
|
||||
|
||||
if (configRoot.contains("espeak")) {
|
||||
auto espeakValue = configRoot["espeak"];
|
||||
if (espeakValue.contains("voice")) {
|
||||
phonemizeConfig.eSpeak.voice = espeakValue["voice"].get<std::string>();
|
||||
}
|
||||
}
|
||||
|
||||
if (configRoot.contains("phoneme_type")) {
|
||||
auto phonemeTypeStr = configRoot["phoneme_type"].get<std::string>();
|
||||
if (phonemeTypeStr == "text") {
|
||||
phonemizeConfig.phonemeType = TextPhonemes;
|
||||
}
|
||||
}
|
||||
|
||||
// phoneme to [id] map
|
||||
// Maps phonemes to one or more phoneme ids (required).
|
||||
if (configRoot.contains("phoneme_id_map")) {
|
||||
auto phonemeIdMapValue = configRoot["phoneme_id_map"];
|
||||
for (auto &fromPhonemeItem : phonemeIdMapValue.items()) {
|
||||
std::string fromPhoneme = fromPhonemeItem.key();
|
||||
if (!isSingleCodepoint(fromPhoneme)) {
|
||||
std::stringstream idsStr;
|
||||
for (auto &toIdValue : fromPhonemeItem.value()) {
|
||||
PhonemeId toId = toIdValue.get<PhonemeId>();
|
||||
idsStr << toId << ",";
|
||||
}
|
||||
|
||||
spdlog::error("\"{}\" is not a single codepoint (ids={})", fromPhoneme,
|
||||
idsStr.str());
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme id map)");
|
||||
}
|
||||
|
||||
auto fromCodepoint = getCodepoint(fromPhoneme);
|
||||
for (auto &toIdValue : fromPhonemeItem.value()) {
|
||||
PhonemeId toId = toIdValue.get<PhonemeId>();
|
||||
phonemizeConfig.phonemeIdMap[fromCodepoint].push_back(toId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// phoneme to [phoneme] map
|
||||
// Maps phonemes to one or more other phonemes (not normally used).
|
||||
if (configRoot.contains("phoneme_map")) {
|
||||
if (!phonemizeConfig.phonemeMap) {
|
||||
phonemizeConfig.phonemeMap.emplace();
|
||||
}
|
||||
|
||||
auto phonemeMapValue = configRoot["phoneme_map"];
|
||||
for (auto &fromPhonemeItem : phonemeMapValue.items()) {
|
||||
std::string fromPhoneme = fromPhonemeItem.key();
|
||||
if (!isSingleCodepoint(fromPhoneme)) {
|
||||
spdlog::error("\"{}\" is not a single codepoint", fromPhoneme);
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme map)");
|
||||
}
|
||||
|
||||
auto fromCodepoint = getCodepoint(fromPhoneme);
|
||||
for (auto &toPhonemeValue : fromPhonemeItem.value()) {
|
||||
std::string toPhoneme = toPhonemeValue.get<std::string>();
|
||||
if (!isSingleCodepoint(toPhoneme)) {
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme map)");
|
||||
}
|
||||
|
||||
auto toCodepoint = getCodepoint(toPhoneme);
|
||||
(*phonemizeConfig.phonemeMap)[fromCodepoint].push_back(toCodepoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} /* parsePhonemizeConfig */
|
||||
|
||||
// Load JSON config for audio synthesis
|
||||
void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
|
||||
// {
|
||||
// "audio": {
|
||||
// "sample_rate": 22050
|
||||
// },
|
||||
// "inference": {
|
||||
// "noise_scale": 0.667,
|
||||
// "length_scale": 1,
|
||||
// "noise_w": 0.8,
|
||||
// "phoneme_silence": {
|
||||
// "<phoneme>": <seconds of silence>,
|
||||
// ...
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
if (configRoot.contains("audio")) {
|
||||
auto audioValue = configRoot["audio"];
|
||||
if (audioValue.contains("sample_rate")) {
|
||||
// Default sample rate is 22050 Hz
|
||||
synthesisConfig.sampleRate = audioValue.value("sample_rate", 22050);
|
||||
}
|
||||
}
|
||||
|
||||
if (configRoot.contains("inference")) {
|
||||
// Overrides default inference settings
|
||||
auto inferenceValue = configRoot["inference"];
|
||||
if (inferenceValue.contains("noise_scale")) {
|
||||
synthesisConfig.noiseScale = inferenceValue.value("noise_scale", 0.667f);
|
||||
}
|
||||
|
||||
if (inferenceValue.contains("length_scale")) {
|
||||
synthesisConfig.lengthScale = inferenceValue.value("length_scale", 1.0f);
|
||||
}
|
||||
|
||||
if (inferenceValue.contains("noise_w")) {
|
||||
synthesisConfig.noiseW = inferenceValue.value("noise_w", 0.8f);
|
||||
}
|
||||
|
||||
if (inferenceValue.contains("phoneme_silence")) {
|
||||
// phoneme -> seconds of silence to add after
|
||||
synthesisConfig.phonemeSilenceSeconds.emplace();
|
||||
auto phonemeSilenceValue = inferenceValue["phoneme_silence"];
|
||||
for (auto &phonemeItem : phonemeSilenceValue.items()) {
|
||||
std::string phonemeStr = phonemeItem.key();
|
||||
if (!isSingleCodepoint(phonemeStr)) {
|
||||
spdlog::error("\"{}\" is not a single codepoint", phonemeStr);
|
||||
throw std::runtime_error(
|
||||
"Phonemes must be one codepoint (phoneme silence)");
|
||||
}
|
||||
|
||||
auto phoneme = getCodepoint(phonemeStr);
|
||||
(*synthesisConfig.phonemeSilenceSeconds)[phoneme] =
|
||||
phonemeItem.value().get<float>();
|
||||
}
|
||||
|
||||
} // if phoneme_silence
|
||||
|
||||
} // if inference
|
||||
|
||||
} /* parseSynthesisConfig */
|
||||
|
||||
// Load JSON config describing the model's speakers.
//
// Reads "num_speakers" unconditionally and, when present, fills the optional
// speaker name -> id map from "speaker_id_map".
void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {

  modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();

  if (!configRoot.contains("speaker_id_map")) {
    // Nothing else to read.
    return;
  }

  // Keep entries from an earlier call; create the map only if absent.
  if (!modelConfig.speakerIdMap) {
    modelConfig.speakerIdMap.emplace();
  }

  json speakerIdMapValue = configRoot["speaker_id_map"];
  for (auto &speakerItem : speakerIdMapValue.items()) {
    (*modelConfig.speakerIdMap)[speakerItem.key()] =
        speakerItem.value().get<SpeakerId>();
  }

} /* parseModelConfig */
|
||||
|
||||
void initialize(PiperConfig &config) {
|
||||
if (config.useESpeak) {
|
||||
// Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
|
||||
// See: https://github.com/rhasspy/espeak-ng
|
||||
spdlog::debug("Initializing eSpeak");
|
||||
int result = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
|
||||
/*buflength*/ 0,
|
||||
/*path*/ config.eSpeakDataPath.c_str(),
|
||||
/*options*/ 0);
|
||||
if (result < 0) {
|
||||
throw std::runtime_error("Failed to initialize eSpeak-ng");
|
||||
}
|
||||
|
||||
spdlog::debug("Initialized eSpeak");
|
||||
}
|
||||
|
||||
// Load onnx model for libtashkeel
|
||||
// https://github.com/mush42/libtashkeel/
|
||||
if (config.useTashkeel) {
|
||||
spdlog::debug("Using libtashkeel for diacritization");
|
||||
if (!config.tashkeelModelPath) {
|
||||
throw std::runtime_error("No path to libtashkeel model");
|
||||
}
|
||||
|
||||
spdlog::debug("Loading libtashkeel model from {}",
|
||||
config.tashkeelModelPath.value());
|
||||
config.tashkeelState = std::make_unique<tashkeel::State>();
|
||||
tashkeel::tashkeel_load(config.tashkeelModelPath.value(),
|
||||
*config.tashkeelState);
|
||||
spdlog::debug("Initialized libtashkeel");
|
||||
}
|
||||
|
||||
spdlog::info("Initialized piper");
|
||||
}
|
||||
|
||||
// Shut down what initialize() started: espeak-ng is terminated when
// config.useESpeak is set. Nothing else is released here.
void terminate(PiperConfig &config) {
  const bool eSpeakInUse = config.useESpeak;

  if (eSpeakInUse) {
    // Clean up espeak-ng
    spdlog::debug("Terminating eSpeak");
    espeak_Terminate();
    spdlog::debug("Terminated eSpeak");
  }

  spdlog::info("Terminated piper");
}
|
||||
|
||||
void loadModel(std::string modelPath, ModelSession &session, bool useCuda) {
|
||||
spdlog::debug("Loading onnx model from {}", modelPath);
|
||||
session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
|
||||
instanceName.c_str());
|
||||
session.env.DisableTelemetryEvents();
|
||||
|
||||
if (useCuda) {
|
||||
// Use CUDA provider
|
||||
OrtCUDAProviderOptions cuda_options{};
|
||||
cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic;
|
||||
session.options.AppendExecutionProvider_CUDA(cuda_options);
|
||||
}
|
||||
|
||||
// Slows down performance by ~2x
|
||||
// session.options.SetIntraOpNumThreads(1);
|
||||
|
||||
// Roughly doubles load time for no visible inference benefit
|
||||
// session.options.SetGraphOptimizationLevel(
|
||||
// GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
|
||||
|
||||
session.options.SetGraphOptimizationLevel(
|
||||
GraphOptimizationLevel::ORT_DISABLE_ALL);
|
||||
|
||||
// Slows down performance very slightly
|
||||
// session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
|
||||
|
||||
session.options.DisableCpuMemArena();
|
||||
session.options.DisableMemPattern();
|
||||
session.options.DisableProfiling();
|
||||
|
||||
auto startTime = std::chrono::steady_clock::now();
|
||||
|
||||
#ifdef _WIN32
|
||||
auto modelPathW = std::wstring(modelPath.begin(), modelPath.end());
|
||||
auto modelPathStr = modelPathW.c_str();
|
||||
#else
|
||||
auto modelPathStr = modelPath.c_str();
|
||||
#endif
|
||||
|
||||
session.onnx = Ort::Session(session.env, modelPathStr, session.options);
|
||||
|
||||
auto endTime = std::chrono::steady_clock::now();
|
||||
spdlog::debug("Loaded onnx model in {} second(s)",
|
||||
std::chrono::duration<double>(endTime - startTime).count());
|
||||
}
|
||||
|
||||
// Load Onnx model and JSON config file
|
||||
void loadVoice(PiperConfig &config, std::string modelPath,
|
||||
std::string modelConfigPath, Voice &voice,
|
||||
std::optional<SpeakerId> &speakerId, bool useCuda) {
|
||||
spdlog::debug("Parsing voice config at {}", modelConfigPath);
|
||||
std::ifstream modelConfigFile(modelConfigPath);
|
||||
voice.configRoot = json::parse(modelConfigFile);
|
||||
|
||||
parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
|
||||
parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
|
||||
parseModelConfig(voice.configRoot, voice.modelConfig);
|
||||
|
||||
if (voice.modelConfig.numSpeakers > 1) {
|
||||
// Multi-speaker model
|
||||
if (speakerId) {
|
||||
voice.synthesisConfig.speakerId = speakerId;
|
||||
} else {
|
||||
// Default speaker
|
||||
voice.synthesisConfig.speakerId = 0;
|
||||
}
|
||||
}
|
||||
|
||||
spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);
|
||||
|
||||
loadModel(modelPath, voice.session, useCuda);
|
||||
|
||||
} /* loadVoice */
|
||||
|
||||
// Phoneme ids to WAV audio
|
||||
void synthesize(std::vector<PhonemeId> &phonemeIds,
|
||||
SynthesisConfig &synthesisConfig, ModelSession &session,
|
||||
std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
|
||||
spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());
|
||||
|
||||
auto memoryInfo = Ort::MemoryInfo::CreateCpu(
|
||||
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
|
||||
|
||||
// Allocate
|
||||
std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
|
||||
std::vector<float> scales{synthesisConfig.noiseScale,
|
||||
synthesisConfig.lengthScale,
|
||||
synthesisConfig.noiseW};
|
||||
|
||||
std::vector<Ort::Value> inputTensors;
|
||||
std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
|
||||
phonemeIdsShape.size()));
|
||||
|
||||
std::vector<int64_t> phomemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
|
||||
phomemeIdLengthsShape.data(), phomemeIdLengthsShape.size()));
|
||||
|
||||
std::vector<int64_t> scalesShape{(int64_t)scales.size()};
|
||||
inputTensors.push_back(
|
||||
Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
|
||||
scalesShape.data(), scalesShape.size()));
|
||||
|
||||
// Add speaker id.
|
||||
// NOTE: These must be kept outside the "if" below to avoid being deallocated.
|
||||
std::vector<int64_t> speakerId{
|
||||
(int64_t)synthesisConfig.speakerId.value_or(0)};
|
||||
std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
|
||||
|
||||
if (synthesisConfig.speakerId) {
|
||||
inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
|
||||
memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
|
||||
speakerIdShape.size()));
|
||||
}
|
||||
|
||||
// From export_onnx.py
|
||||
std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
|
||||
"sid"};
|
||||
std::array<const char *, 1> outputNames = {"output"};
|
||||
|
||||
// Infer
|
||||
auto startTime = std::chrono::steady_clock::now();
|
||||
auto outputTensors = session.onnx.Run(
|
||||
Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
|
||||
inputTensors.size(), outputNames.data(), outputNames.size());
|
||||
auto endTime = std::chrono::steady_clock::now();
|
||||
|
||||
if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
|
||||
throw std::runtime_error("Invalid output tensors");
|
||||
}
|
||||
auto inferDuration = std::chrono::duration<double>(endTime - startTime);
|
||||
result.inferSeconds = inferDuration.count();
|
||||
|
||||
const float *audio = outputTensors.front().GetTensorData<float>();
|
||||
auto audioShape =
|
||||
outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
|
||||
int64_t audioCount = audioShape[audioShape.size() - 1];
|
||||
|
||||
result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
|
||||
result.realTimeFactor = 0.0;
|
||||
if (result.audioSeconds > 0) {
|
||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||
}
|
||||
spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
|
||||
result.audioSeconds, result.inferSeconds);
|
||||
|
||||
// Get max audio value for scaling
|
||||
float maxAudioValue = 0.01f;
|
||||
for (int64_t i = 0; i < audioCount; i++) {
|
||||
float audioValue = abs(audio[i]);
|
||||
if (audioValue > maxAudioValue) {
|
||||
maxAudioValue = audioValue;
|
||||
}
|
||||
}
|
||||
|
||||
// We know the size up front
|
||||
audioBuffer.reserve(audioCount);
|
||||
|
||||
// Scale audio to fill range and convert to int16
|
||||
float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
|
||||
for (int64_t i = 0; i < audioCount; i++) {
|
||||
int16_t intAudioValue = static_cast<int16_t>(
|
||||
std::clamp(audio[i] * audioScale,
|
||||
static_cast<float>(std::numeric_limits<int16_t>::min()),
|
||||
static_cast<float>(std::numeric_limits<int16_t>::max())));
|
||||
|
||||
audioBuffer.push_back(intAudioValue);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
for (std::size_t i = 0; i < outputTensors.size(); i++) {
|
||||
Ort::detail::OrtRelease(outputTensors[i].release());
|
||||
}
|
||||
|
||||
for (std::size_t i = 0; i < inputTensors.size(); i++) {
|
||||
Ort::detail::OrtRelease(inputTensors[i].release());
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Phonemize text and synthesize audio
|
||||
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
|
||||
const std::function<void()> &audioCallback) {
|
||||
|
||||
std::size_t sentenceSilenceSamples = 0;
|
||||
if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
|
||||
sentenceSilenceSamples = (std::size_t)(
|
||||
voice.synthesisConfig.sentenceSilenceSeconds *
|
||||
voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
|
||||
}
|
||||
|
||||
if (config.useTashkeel) {
|
||||
if (!config.tashkeelState) {
|
||||
throw std::runtime_error("Tashkeel model is not loaded");
|
||||
}
|
||||
|
||||
spdlog::debug("Diacritizing text with libtashkeel: {}", text);
|
||||
text = tashkeel::tashkeel_run(text, *config.tashkeelState);
|
||||
}
|
||||
|
||||
// Phonemes for each sentence
|
||||
spdlog::debug("Phonemizing text: {}", text);
|
||||
std::vector<std::vector<Phoneme>> phonemes;
|
||||
|
||||
if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
|
||||
// Use espeak-ng for phonemization
|
||||
eSpeakPhonemeConfig eSpeakConfig;
|
||||
eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice;
|
||||
phonemize_eSpeak(text, eSpeakConfig, phonemes);
|
||||
} else {
|
||||
// Use UTF-8 codepoints as "phonemes"
|
||||
CodepointsPhonemeConfig codepointsConfig;
|
||||
phonemize_codepoints(text, codepointsConfig, phonemes);
|
||||
}
|
||||
|
||||
// Synthesize each sentence independently.
|
||||
std::vector<PhonemeId> phonemeIds;
|
||||
std::map<Phoneme, std::size_t> missingPhonemes;
|
||||
for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
|
||||
++phonemesIter) {
|
||||
std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
|
||||
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phonemes
|
||||
std::string phonemesStr;
|
||||
for (auto phoneme : sentencePhonemes) {
|
||||
utf8::append(phoneme, std::back_inserter(phonemesStr));
|
||||
}
|
||||
|
||||
spdlog::debug("Converting {} phoneme(s) to ids: {}",
|
||||
sentencePhonemes.size(), phonemesStr);
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
|
||||
std::vector<SynthesisResult> phraseResults;
|
||||
std::vector<size_t> phraseSilenceSamples;
|
||||
|
||||
// Use phoneme/id map from config
|
||||
PhonemeIdConfig idConfig;
|
||||
idConfig.phonemeIdMap =
|
||||
std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);
|
||||
|
||||
if (voice.synthesisConfig.phonemeSilenceSeconds) {
|
||||
// Split into phrases
|
||||
std::map<Phoneme, float> &phonemeSilenceSeconds =
|
||||
*voice.synthesisConfig.phonemeSilenceSeconds;
|
||||
|
||||
auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
|
||||
phrasePhonemes.push_back(currentPhrasePhonemes);
|
||||
|
||||
for (auto sentencePhonemesIter = sentencePhonemes.begin();
|
||||
sentencePhonemesIter != sentencePhonemes.end();
|
||||
sentencePhonemesIter++) {
|
||||
Phoneme ¤tPhoneme = *sentencePhonemesIter;
|
||||
currentPhrasePhonemes->push_back(currentPhoneme);
|
||||
|
||||
if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
|
||||
// Split at phrase boundary
|
||||
phraseSilenceSamples.push_back(
|
||||
(std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
|
||||
voice.synthesisConfig.sampleRate *
|
||||
voice.synthesisConfig.channels));
|
||||
|
||||
currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
|
||||
phrasePhonemes.push_back(currentPhrasePhonemes);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Use all phonemes
|
||||
phrasePhonemes.push_back(
|
||||
std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
|
||||
}
|
||||
|
||||
// Ensure results/samples are the same size
|
||||
while (phraseResults.size() < phrasePhonemes.size()) {
|
||||
phraseResults.emplace_back();
|
||||
}
|
||||
|
||||
while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
|
||||
phraseSilenceSamples.push_back(0);
|
||||
}
|
||||
|
||||
// phonemes -> ids -> audio
|
||||
for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
|
||||
if (phrasePhonemes[phraseIdx]->size() <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// phonemes -> ids
|
||||
phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
|
||||
missingPhonemes);
|
||||
if (spdlog::should_log(spdlog::level::debug)) {
|
||||
// DEBUG log for phoneme ids
|
||||
std::stringstream phonemeIdsStr;
|
||||
for (auto phonemeId : phonemeIds) {
|
||||
phonemeIdsStr << phonemeId << ", ";
|
||||
}
|
||||
|
||||
spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
|
||||
phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
|
||||
phonemeIdsStr.str());
|
||||
}
|
||||
|
||||
// ids -> audio
|
||||
synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
|
||||
phraseResults[phraseIdx]);
|
||||
|
||||
// Add end of phrase silence
|
||||
for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
|
||||
audioBuffer.push_back(0);
|
||||
}
|
||||
|
||||
result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
|
||||
result.inferSeconds += phraseResults[phraseIdx].inferSeconds;
|
||||
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
// Add end of sentence silence
|
||||
if (sentenceSilenceSamples > 0) {
|
||||
for (std::size_t i = 0; i < sentenceSilenceSamples; i++) {
|
||||
audioBuffer.push_back(0);
|
||||
}
|
||||
}
|
||||
|
||||
if (audioCallback) {
|
||||
// Call back must copy audio since it is cleared afterwards.
|
||||
audioCallback();
|
||||
audioBuffer.clear();
|
||||
}
|
||||
|
||||
phonemeIds.clear();
|
||||
}
|
||||
|
||||
if (missingPhonemes.size() > 0) {
|
||||
spdlog::warn("Missing {} phoneme(s) from phoneme/id map!",
|
||||
missingPhonemes.size());
|
||||
|
||||
for (auto phonemeCount : missingPhonemes) {
|
||||
std::string phonemeStr;
|
||||
utf8::append(phonemeCount.first, std::back_inserter(phonemeStr));
|
||||
spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr,
|
||||
(uint32_t)phonemeCount.first, phonemeCount.second);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.audioSeconds > 0) {
|
||||
result.realTimeFactor = result.inferSeconds / result.audioSeconds;
|
||||
}
|
||||
|
||||
} /* textToAudio */
|
||||
|
||||
// Phonemize text and synthesize audio to WAV file
|
||||
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
|
||||
std::ostream &audioFile, SynthesisResult &result) {
|
||||
|
||||
std::vector<int16_t> audioBuffer;
|
||||
textToAudio(config, voice, text, audioBuffer, result, NULL);
|
||||
|
||||
// Write WAV
|
||||
auto synthesisConfig = voice.synthesisConfig;
|
||||
writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
|
||||
synthesisConfig.channels, (int32_t)audioBuffer.size(),
|
||||
audioFile);
|
||||
|
||||
audioFile.write((const char *)audioBuffer.data(),
|
||||
sizeof(int16_t) * audioBuffer.size());
|
||||
|
||||
} /* textToWavFile */
|
||||
|
||||
} // namespace piper
|
||||
132
mlu_370-piper/piper/src/cpp/piper.hpp
Normal file
132
mlu_370-piper/piper/src/cpp/piper.hpp
Normal file
@@ -0,0 +1,132 @@
|
||||
#ifndef PIPER_H_
|
||||
#define PIPER_H_
|
||||
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
#include <piper-phonemize/phoneme_ids.hpp>
|
||||
#include <piper-phonemize/phonemize.hpp>
|
||||
#include <piper-phonemize/tashkeel.hpp>
|
||||
|
||||
#include "json.hpp"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
namespace piper {
|
||||
|
||||
typedef int64_t SpeakerId;
|
||||
|
||||
struct eSpeakConfig {
|
||||
std::string voice = "en-us";
|
||||
};
|
||||
|
||||
struct PiperConfig {
|
||||
std::string eSpeakDataPath;
|
||||
bool useESpeak = true;
|
||||
|
||||
bool useTashkeel = false;
|
||||
std::optional<std::string> tashkeelModelPath;
|
||||
std::unique_ptr<tashkeel::State> tashkeelState;
|
||||
};
|
||||
|
||||
enum PhonemeType { eSpeakPhonemes, TextPhonemes };
|
||||
|
||||
struct PhonemizeConfig {
|
||||
PhonemeType phonemeType = eSpeakPhonemes;
|
||||
std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
|
||||
std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
|
||||
|
||||
PhonemeId idPad = 0; // padding (optionally interspersed)
|
||||
PhonemeId idBos = 1; // beginning of sentence
|
||||
PhonemeId idEos = 2; // end of sentence
|
||||
bool interspersePad = true;
|
||||
|
||||
eSpeakConfig eSpeak;
|
||||
};
|
||||
|
||||
struct SynthesisConfig {
|
||||
// VITS inference settings
|
||||
float noiseScale = 0.667f;
|
||||
float lengthScale = 1.0f;
|
||||
float noiseW = 0.8f;
|
||||
|
||||
// Audio settings
|
||||
int sampleRate = 22050;
|
||||
int sampleWidth = 2; // 16-bit
|
||||
int channels = 1; // mono
|
||||
|
||||
// Speaker id from 0 to numSpeakers - 1
|
||||
std::optional<SpeakerId> speakerId;
|
||||
|
||||
// Extra silence
|
||||
float sentenceSilenceSeconds = 0.2f;
|
||||
std::optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
|
||||
};
|
||||
|
||||
struct ModelConfig {
|
||||
int numSpeakers;
|
||||
|
||||
// speaker name -> id
|
||||
std::optional<std::map<std::string, SpeakerId>> speakerIdMap;
|
||||
};
|
||||
|
||||
struct ModelSession {
|
||||
Ort::Session onnx;
|
||||
Ort::AllocatorWithDefaultOptions allocator;
|
||||
Ort::SessionOptions options;
|
||||
Ort::Env env;
|
||||
|
||||
ModelSession() : onnx(nullptr){};
|
||||
};
|
||||
|
||||
struct SynthesisResult {
|
||||
double inferSeconds;
|
||||
double audioSeconds;
|
||||
double realTimeFactor;
|
||||
};
|
||||
|
||||
struct Voice {
|
||||
json configRoot;
|
||||
PhonemizeConfig phonemizeConfig;
|
||||
SynthesisConfig synthesisConfig;
|
||||
ModelConfig modelConfig;
|
||||
ModelSession session;
|
||||
};
|
||||
|
||||
// True if the string is a single UTF-8 codepoint
|
||||
bool isSingleCodepoint(std::string s);
|
||||
|
||||
// Get the first UTF-8 codepoint of a string
|
||||
Phoneme getCodepoint(std::string s);
|
||||
|
||||
// Get version of Piper
|
||||
std::string getVersion();
|
||||
|
||||
// Must be called before using textTo* functions
|
||||
void initialize(PiperConfig &config);
|
||||
|
||||
// Clean up
|
||||
void terminate(PiperConfig &config);
|
||||
|
||||
// Load Onnx model and JSON config file
|
||||
void loadVoice(PiperConfig &config, std::string modelPath,
|
||||
std::string modelConfigPath, Voice &voice,
|
||||
std::optional<SpeakerId> &speakerId, bool useCuda);
|
||||
|
||||
// Phonemize text and synthesize audio
|
||||
void textToAudio(PiperConfig &config, Voice &voice, std::string text,
|
||||
std::vector<int16_t> &audioBuffer, SynthesisResult &result,
|
||||
const std::function<void()> &audioCallback);
|
||||
|
||||
// Phonemize text and synthesize audio to WAV file
|
||||
void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
|
||||
std::ostream &audioFile, SynthesisResult &result);
|
||||
|
||||
} // namespace piper
|
||||
|
||||
#endif // PIPER_H_
|
||||
60
mlu_370-piper/piper/src/cpp/test.cpp
Normal file
60
mlu_370-piper/piper/src/cpp/test.cpp
Normal file
@@ -0,0 +1,60 @@
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <optional>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "json.hpp"
|
||||
#include "piper.hpp"
|
||||
|
||||
using namespace std;
|
||||
using json = nlohmann::json;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
piper::PiperConfig piperConfig;
|
||||
piper::Voice voice;
|
||||
|
||||
if (argc < 2) {
|
||||
std::cerr << "Need voice model path" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argc < 3) {
|
||||
std::cerr << "Need espeak-ng-data path" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argc < 4) {
|
||||
std::cerr << "Need output WAV path" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto modelPath = std::string(argv[1]);
|
||||
piperConfig.eSpeakDataPath = std::string(argv[2]);
|
||||
auto outputPath = std::string(argv[3]);
|
||||
|
||||
optional<piper::SpeakerId> speakerId;
|
||||
loadVoice(piperConfig, modelPath, modelPath + ".json", voice, speakerId,
|
||||
false);
|
||||
piper::initialize(piperConfig);
|
||||
|
||||
// Output audio to WAV file
|
||||
ofstream audioFile(outputPath, ios::binary);
|
||||
|
||||
piper::SynthesisResult result;
|
||||
piper::textToWavFile(piperConfig, voice, "This is a test.", audioFile,
|
||||
result);
|
||||
piper::terminate(piperConfig);
|
||||
|
||||
// Verify that file has some data
|
||||
if (audioFile.tellp() < 10000) {
|
||||
std::cerr << "ERROR: Output file is smaller than expected!" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "OK" << std::endl;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
34
mlu_370-piper/piper/src/cpp/utf8.h
Normal file
34
mlu_370-piper/piper/src/cpp/utf8.h
Normal file
@@ -0,0 +1,34 @@
|
||||
// Copyright 2006 Nemanja Trifunovic
|
||||
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
|
||||
#include "utf8/checked.h"
|
||||
#include "utf8/unchecked.h"
|
||||
|
||||
#endif // header guard
|
||||
335
mlu_370-piper/piper/src/cpp/utf8/checked.h
Normal file
335
mlu_370-piper/piper/src/cpp/utf8/checked.h
Normal file
@@ -0,0 +1,335 @@
|
||||
// Copyright 2006-2016 Nemanja Trifunovic
|
||||
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
|
||||
#include "core.h"
|
||||
#include <stdexcept>
|
||||
|
||||
namespace utf8
|
||||
{
|
||||
// Base for the exceptions that may be thrown from the library
|
||||
class exception : public ::std::exception {
|
||||
};
|
||||
|
||||
// Exceptions that may be thrown from the library functions.
|
||||
class invalid_code_point : public exception {
|
||||
uint32_t cp;
|
||||
public:
|
||||
invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
|
||||
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
|
||||
uint32_t code_point() const {return cp;}
|
||||
};
|
||||
|
||||
class invalid_utf8 : public exception {
|
||||
uint8_t u8;
|
||||
public:
|
||||
invalid_utf8 (uint8_t u) : u8(u) {}
|
||||
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
|
||||
uint8_t utf8_octet() const {return u8;}
|
||||
};
|
||||
|
||||
class invalid_utf16 : public exception {
|
||||
uint16_t u16;
|
||||
public:
|
||||
invalid_utf16 (uint16_t u) : u16(u) {}
|
||||
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
|
||||
uint16_t utf16_word() const {return u16;}
|
||||
};
|
||||
|
||||
class not_enough_room : public exception {
|
||||
public:
|
||||
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
|
||||
};
|
||||
|
||||
/// The library API - functions intended to be called by the users
|
||||
|
||||
template <typename octet_iterator>
|
||||
octet_iterator append(uint32_t cp, octet_iterator result)
|
||||
{
|
||||
if (!utf8::internal::is_code_point_valid(cp))
|
||||
throw invalid_code_point(cp);
|
||||
|
||||
if (cp < 0x80) // one octet
|
||||
*(result++) = static_cast<uint8_t>(cp);
|
||||
else if (cp < 0x800) { // two octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
else if (cp < 0x10000) { // three octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
else { // four octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
|
||||
{
|
||||
while (start != end) {
|
||||
octet_iterator sequence_start = start;
|
||||
internal::utf_error err_code = utf8::internal::validate_next(start, end);
|
||||
switch (err_code) {
|
||||
case internal::UTF8_OK :
|
||||
for (octet_iterator it = sequence_start; it != start; ++it)
|
||||
*out++ = *it;
|
||||
break;
|
||||
case internal::NOT_ENOUGH_ROOM:
|
||||
out = utf8::append (replacement, out);
|
||||
start = end;
|
||||
break;
|
||||
case internal::INVALID_LEAD:
|
||||
out = utf8::append (replacement, out);
|
||||
++start;
|
||||
break;
|
||||
case internal::INCOMPLETE_SEQUENCE:
|
||||
case internal::OVERLONG_SEQUENCE:
|
||||
case internal::INVALID_CODE_POINT:
|
||||
out = utf8::append (replacement, out);
|
||||
++start;
|
||||
// just one replacement mark for the sequence
|
||||
while (start != end && utf8::internal::is_trail(*start))
|
||||
++start;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
|
||||
{
|
||||
static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
|
||||
return utf8::replace_invalid(start, end, out, replacement_marker);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
uint32_t next(octet_iterator& it, octet_iterator end)
|
||||
{
|
||||
uint32_t cp = 0;
|
||||
internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
|
||||
switch (err_code) {
|
||||
case internal::UTF8_OK :
|
||||
break;
|
||||
case internal::NOT_ENOUGH_ROOM :
|
||||
throw not_enough_room();
|
||||
case internal::INVALID_LEAD :
|
||||
case internal::INCOMPLETE_SEQUENCE :
|
||||
case internal::OVERLONG_SEQUENCE :
|
||||
throw invalid_utf8(*it);
|
||||
case internal::INVALID_CODE_POINT :
|
||||
throw invalid_code_point(cp);
|
||||
}
|
||||
return cp;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
uint32_t peek_next(octet_iterator it, octet_iterator end)
|
||||
{
|
||||
return utf8::next(it, end);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
uint32_t prior(octet_iterator& it, octet_iterator start)
|
||||
{
|
||||
// can't do much if it == start
|
||||
if (it == start)
|
||||
throw not_enough_room();
|
||||
|
||||
octet_iterator end = it;
|
||||
// Go back until we hit either a lead octet or start
|
||||
while (utf8::internal::is_trail(*(--it)))
|
||||
if (it == start)
|
||||
throw invalid_utf8(*it); // error - no lead byte in the sequence
|
||||
return utf8::peek_next(it, end);
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename distance_type>
|
||||
void advance (octet_iterator& it, distance_type n, octet_iterator end)
|
||||
{
|
||||
const distance_type zero(0);
|
||||
if (n < zero) {
|
||||
// backward
|
||||
for (distance_type i = n; i < zero; ++i)
|
||||
utf8::prior(it, end);
|
||||
} else {
|
||||
// forward
|
||||
for (distance_type i = zero; i < n; ++i)
|
||||
utf8::next(it, end);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
typename std::iterator_traits<octet_iterator>::difference_type
|
||||
distance (octet_iterator first, octet_iterator last)
|
||||
{
|
||||
typename std::iterator_traits<octet_iterator>::difference_type dist;
|
||||
for (dist = 0; first < last; ++dist)
|
||||
utf8::next(first, last);
|
||||
return dist;
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start != end) {
|
||||
uint32_t cp = utf8::internal::mask16(*start++);
|
||||
// Take care of surrogate pairs first
|
||||
if (utf8::internal::is_lead_surrogate(cp)) {
|
||||
if (start != end) {
|
||||
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
|
||||
if (utf8::internal::is_trail_surrogate(trail_surrogate))
|
||||
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||
else
|
||||
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
|
||||
}
|
||||
else
|
||||
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||
|
||||
}
|
||||
// Lone trail surrogate
|
||||
else if (utf8::internal::is_trail_surrogate(cp))
|
||||
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||
|
||||
result = utf8::append(cp, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
while (start < end) {
|
||||
uint32_t cp = utf8::next(start, end);
|
||||
if (cp > 0xffff) { //make a surrogate pair
|
||||
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
|
||||
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
|
||||
}
|
||||
else
|
||||
*result++ = static_cast<uint16_t>(cp);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename u32bit_iterator>
|
||||
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start != end)
|
||||
result = utf8::append(*(start++), result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename u32bit_iterator>
|
||||
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
|
||||
{
|
||||
while (start < end)
|
||||
(*result++) = utf8::next(start, end);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// The iterator class
|
||||
template <typename octet_iterator>
|
||||
class iterator {
|
||||
octet_iterator it;
|
||||
octet_iterator range_start;
|
||||
octet_iterator range_end;
|
||||
public:
|
||||
typedef uint32_t value_type;
|
||||
typedef uint32_t* pointer;
|
||||
typedef uint32_t& reference;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef std::bidirectional_iterator_tag iterator_category;
|
||||
iterator () {}
|
||||
explicit iterator (const octet_iterator& octet_it,
|
||||
const octet_iterator& rangestart,
|
||||
const octet_iterator& rangeend) :
|
||||
it(octet_it), range_start(rangestart), range_end(rangeend)
|
||||
{
|
||||
if (it < range_start || it > range_end)
|
||||
throw std::out_of_range("Invalid utf-8 iterator position");
|
||||
}
|
||||
// the default "big three" are OK
|
||||
octet_iterator base () const { return it; }
|
||||
uint32_t operator * () const
|
||||
{
|
||||
octet_iterator temp = it;
|
||||
return utf8::next(temp, range_end);
|
||||
}
|
||||
bool operator == (const iterator& rhs) const
|
||||
{
|
||||
if (range_start != rhs.range_start || range_end != rhs.range_end)
|
||||
throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
|
||||
return (it == rhs.it);
|
||||
}
|
||||
bool operator != (const iterator& rhs) const
|
||||
{
|
||||
return !(operator == (rhs));
|
||||
}
|
||||
iterator& operator ++ ()
|
||||
{
|
||||
utf8::next(it, range_end);
|
||||
return *this;
|
||||
}
|
||||
iterator operator ++ (int)
|
||||
{
|
||||
iterator temp = *this;
|
||||
utf8::next(it, range_end);
|
||||
return temp;
|
||||
}
|
||||
iterator& operator -- ()
|
||||
{
|
||||
utf8::prior(it, range_start);
|
||||
return *this;
|
||||
}
|
||||
iterator operator -- (int)
|
||||
{
|
||||
iterator temp = *this;
|
||||
utf8::prior(it, range_start);
|
||||
return temp;
|
||||
}
|
||||
}; // class iterator
|
||||
|
||||
} // namespace utf8
|
||||
|
||||
#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
|
||||
#include "cpp17.h"
|
||||
#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
|
||||
#include "cpp11.h"
|
||||
#endif // C++ 11 or later
|
||||
|
||||
#endif //header guard
|
||||
|
||||
338
mlu_370-piper/piper/src/cpp/utf8/core.h
Normal file
338
mlu_370-piper/piper/src/cpp/utf8/core.h
Normal file
@@ -0,0 +1,338 @@
|
||||
// Copyright 2006 Nemanja Trifunovic
|
||||
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
|
||||
#include <iterator>
|
||||
|
||||
// Determine the C++ standard version.
|
||||
// If the user defines UTF_CPP_CPLUSPLUS, use that.
|
||||
// Otherwise, trust the unreliable predefined macro __cplusplus
|
||||
|
||||
#if !defined UTF_CPP_CPLUSPLUS
|
||||
#define UTF_CPP_CPLUSPLUS __cplusplus
|
||||
#endif
|
||||
|
||||
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
|
||||
#define UTF_CPP_OVERRIDE override
|
||||
#define UTF_CPP_NOEXCEPT noexcept
|
||||
#else // C++ 98/03
|
||||
#define UTF_CPP_OVERRIDE
|
||||
#define UTF_CPP_NOEXCEPT throw()
|
||||
#endif // C++ 11 or later
|
||||
|
||||
|
||||
namespace utf8
|
||||
{
|
||||
// The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
|
||||
// You may need to change them to match your system.
// NOTE(review): these assume unsigned short is 16 bits wide and unsigned int
// is 32 bits wide on the target platform - confirm for unusual ABIs.
|
||||
// These typedefs have the same names as ones from cstdint, or boost/cstdint
// They are declared inside namespace utf8, so they do not clash with the
// global ones.
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
// Helper code - not intended to be directly called by the library users. May be changed at any time
|
||||
namespace internal
|
||||
{
|
||||
// Unicode constants
|
||||
// Leading (high) surrogates: 0xd800 - 0xdbff
|
||||
// Trailing (low) surrogates: 0xdc00 - 0xdfff
|
||||
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
|
||||
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
|
||||
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
|
||||
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
|
||||
// Added to (cp >> 10) in utf8to16 to obtain the lead surrogate of a pair.
const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10)
|
||||
// Added to (lead << 10) + trail in utf16to8 to recover the code point; the
// value is the formula on the right reduced modulo 2^32, so the sum relies
// on unsigned wrap-around.
const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
|
||||
|
||||
// Maximum valid value for a Unicode code point
|
||||
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
|
||||
|
||||
// Returns the low 8 bits of `oc` as an unsigned octet. This makes a value
// taken from a (possibly signed) char safe to shift and compare.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(oc & 0xff);
}
|
||||
// Returns the low 16 bits of `oc` as an unsigned 16-bit value.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(oc & 0xffff);
}
|
||||
template<typename octet_type>
|
||||
inline bool is_trail(octet_type oc)
|
||||
{
|
||||
return ((utf8::internal::mask8(oc) >> 6) == 0x2);
|
||||
}
|
||||
|
||||
template <typename u16>
|
||||
inline bool is_lead_surrogate(u16 cp)
|
||||
{
|
||||
return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
|
||||
}
|
||||
|
||||
template <typename u16>
|
||||
inline bool is_trail_surrogate(u16 cp)
|
||||
{
|
||||
return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
|
||||
}
|
||||
|
||||
template <typename u16>
|
||||
inline bool is_surrogate(u16 cp)
|
||||
{
|
||||
return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
|
||||
}
|
||||
|
||||
template <typename u32>
|
||||
inline bool is_code_point_valid(u32 cp)
|
||||
{
|
||||
return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
inline typename std::iterator_traits<octet_iterator>::difference_type
|
||||
sequence_length(octet_iterator lead_it)
|
||||
{
|
||||
uint8_t lead = utf8::internal::mask8(*lead_it);
|
||||
if (lead < 0x80)
|
||||
return 1;
|
||||
else if ((lead >> 5) == 0x6)
|
||||
return 2;
|
||||
else if ((lead >> 4) == 0xe)
|
||||
return 3;
|
||||
else if ((lead >> 3) == 0x1e)
|
||||
return 4;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
// True when `cp` was decoded from `length` octets although it fits the 1-, 2-
// or 3-octet form and `length` differs from that form's size. Values of
// 0x10000 and above are never reported as overlong.
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
    if (cp < 0x80)
        return length != 1;
    if (cp < 0x800)
        return length != 2;
    if (cp < 0x10000)
        return length != 3;
    return false;
}
|
||||
|
||||
// Status codes produced by the validation and decoding helpers.
enum utf_error {
    UTF8_OK,              // sequence decoded and passed every check
    NOT_ENOUGH_ROOM,      // the range ended before the sequence was complete
    INVALID_LEAD,         // the first octet cannot start a sequence
    INCOMPLETE_SEQUENCE,  // a non-trail octet appeared where a trail octet was expected
    OVERLONG_SEQUENCE,    // more octets were used than the code point needs
    INVALID_CODE_POINT    // decoded value is a surrogate or above CODE_POINT_MAX
};
|
||||
|
||||
/// Helper for get_sequence_x
|
||||
template <typename octet_iterator>
|
||||
utf_error increase_safely(octet_iterator& it, octet_iterator end)
|
||||
{
|
||||
if (++it == end)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
if (!utf8::internal::is_trail(*it))
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
|
||||
return UTF8_OK;
|
||||
}
|
||||
|
||||
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
|
||||
|
||||
/// get_sequence_x functions decode utf-8 sequences of the length x
|
||||
template <typename octet_iterator>
|
||||
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
|
||||
{
|
||||
if (it == end)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
code_point = utf8::internal::mask8(*it);
|
||||
|
||||
return UTF8_OK;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
|
||||
{
|
||||
if (it == end)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
code_point = utf8::internal::mask8(*it);
|
||||
|
||||
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
|
||||
|
||||
code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
|
||||
|
||||
return UTF8_OK;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
|
||||
{
|
||||
if (it == end)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
code_point = utf8::internal::mask8(*it);
|
||||
|
||||
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
|
||||
|
||||
code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
|
||||
|
||||
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
|
||||
|
||||
code_point += (*it) & 0x3f;
|
||||
|
||||
return UTF8_OK;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
|
||||
{
|
||||
if (it == end)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
code_point = utf8::internal::mask8(*it);
|
||||
|
||||
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
|
||||
|
||||
code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
|
||||
|
||||
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
|
||||
|
||||
code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
|
||||
|
||||
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
|
||||
|
||||
code_point += (*it) & 0x3f;
|
||||
|
||||
return UTF8_OK;
|
||||
}
|
||||
|
||||
#undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
|
||||
{
|
||||
if (it == end)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
// Save the original value of it so we can go back in case of failure
|
||||
// Of course, it does not make much sense with i.e. stream iterators
|
||||
octet_iterator original_it = it;
|
||||
|
||||
uint32_t cp = 0;
|
||||
// Determine the sequence length based on the lead octet
|
||||
typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
|
||||
const octet_difference_type length = utf8::internal::sequence_length(it);
|
||||
|
||||
// Get trail octets and calculate the code point
|
||||
utf_error err = UTF8_OK;
|
||||
switch (length) {
|
||||
case 0:
|
||||
return INVALID_LEAD;
|
||||
case 1:
|
||||
err = utf8::internal::get_sequence_1(it, end, cp);
|
||||
break;
|
||||
case 2:
|
||||
err = utf8::internal::get_sequence_2(it, end, cp);
|
||||
break;
|
||||
case 3:
|
||||
err = utf8::internal::get_sequence_3(it, end, cp);
|
||||
break;
|
||||
case 4:
|
||||
err = utf8::internal::get_sequence_4(it, end, cp);
|
||||
break;
|
||||
}
|
||||
|
||||
if (err == UTF8_OK) {
|
||||
// Decoding succeeded. Now, security checks...
|
||||
if (utf8::internal::is_code_point_valid(cp)) {
|
||||
if (!utf8::internal::is_overlong_sequence(cp, length)){
|
||||
// Passed! Return here.
|
||||
code_point = cp;
|
||||
++it;
|
||||
return UTF8_OK;
|
||||
}
|
||||
else
|
||||
err = OVERLONG_SEQUENCE;
|
||||
}
|
||||
else
|
||||
err = INVALID_CODE_POINT;
|
||||
}
|
||||
|
||||
// Failure branch - restore the original value of the iterator
|
||||
it = original_it;
|
||||
return err;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
|
||||
uint32_t ignored;
|
||||
return utf8::internal::validate_next(it, end, ignored);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// The library API - functions intended to be called by the users
|
||||
|
||||
// Byte order mark: the three octets EF BB BF (U+FEFF encoded as UTF-8).
// starts_with_bom() compares the head of a range against them one by one.
|
||||
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
|
||||
|
||||
template <typename octet_iterator>
|
||||
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
|
||||
{
|
||||
octet_iterator result = start;
|
||||
while (result != end) {
|
||||
utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
|
||||
if (err_code != internal::UTF8_OK)
|
||||
return result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
inline bool is_valid(octet_iterator start, octet_iterator end)
|
||||
{
|
||||
return (utf8::find_invalid(start, end) == end);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
|
||||
{
|
||||
return (
|
||||
((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
|
||||
((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
|
||||
((it != end) && (utf8::internal::mask8(*it)) == bom[2])
|
||||
);
|
||||
}
|
||||
} // namespace utf8
|
||||
|
||||
#endif // header guard
|
||||
|
||||
|
||||
103
mlu_370-piper/piper/src/cpp/utf8/cpp11.h
Normal file
103
mlu_370-piper/piper/src/cpp/utf8/cpp11.h
Normal file
@@ -0,0 +1,103 @@
|
||||
// Copyright 2018 Nemanja Trifunovic
|
||||
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
|
||||
#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
|
||||
|
||||
#include "checked.h"
|
||||
#include <string>
|
||||
|
||||
namespace utf8
|
||||
{
|
||||
|
||||
inline void append(char32_t cp, std::string& s)
|
||||
{
|
||||
append(uint32_t(cp), std::back_inserter(s));
|
||||
}
|
||||
|
||||
inline std::string utf16to8(const std::u16string& s)
|
||||
{
|
||||
std::string result;
|
||||
utf16to8(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::u16string utf8to16(const std::string& s)
|
||||
{
|
||||
std::u16string result;
|
||||
utf8to16(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::string utf32to8(const std::u32string& s)
|
||||
{
|
||||
std::string result;
|
||||
utf32to8(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::u32string utf8to32(const std::string& s)
|
||||
{
|
||||
std::u32string result;
|
||||
utf8to32(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::size_t find_invalid(const std::string& s)
|
||||
{
|
||||
std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
|
||||
return (invalid == s.end()) ? std::string::npos : (invalid - s.begin());
|
||||
}
|
||||
|
||||
inline bool is_valid(const std::string& s)
|
||||
{
|
||||
return is_valid(s.begin(), s.end());
|
||||
}
|
||||
|
||||
inline std::string replace_invalid(const std::string& s, char32_t replacement)
|
||||
{
|
||||
std::string result;
|
||||
replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::string replace_invalid(const std::string& s)
|
||||
{
|
||||
std::string result;
|
||||
replace_invalid(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline bool starts_with_bom(const std::string& s)
|
||||
{
|
||||
return starts_with_bom(s.begin(), s.end());
|
||||
}
|
||||
|
||||
} // namespace utf8
|
||||
|
||||
#endif // header guard
|
||||
|
||||
103
mlu_370-piper/piper/src/cpp/utf8/cpp17.h
Normal file
103
mlu_370-piper/piper/src/cpp/utf8/cpp17.h
Normal file
@@ -0,0 +1,103 @@
|
||||
// Copyright 2018 Nemanja Trifunovic
|
||||
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
|
||||
#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
|
||||
|
||||
#include "checked.h"
|
||||
#include <string>
|
||||
|
||||
namespace utf8
|
||||
{
|
||||
|
||||
inline void append(char32_t cp, std::string& s)
|
||||
{
|
||||
append(uint32_t(cp), std::back_inserter(s));
|
||||
}
|
||||
|
||||
inline std::string utf16to8(std::u16string_view s)
|
||||
{
|
||||
std::string result;
|
||||
utf16to8(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::u16string utf8to16(std::string_view s)
|
||||
{
|
||||
std::u16string result;
|
||||
utf8to16(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::string utf32to8(std::u32string_view s)
|
||||
{
|
||||
std::string result;
|
||||
utf32to8(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::u32string utf8to32(std::string_view s)
|
||||
{
|
||||
std::u32string result;
|
||||
utf8to32(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::size_t find_invalid(std::string_view s)
|
||||
{
|
||||
std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end());
|
||||
return (invalid == s.end()) ? std::string_view::npos : (invalid - s.begin());
|
||||
}
|
||||
|
||||
inline bool is_valid(std::string_view s)
|
||||
{
|
||||
return is_valid(s.begin(), s.end());
|
||||
}
|
||||
|
||||
inline std::string replace_invalid(std::string_view s, char32_t replacement)
|
||||
{
|
||||
std::string result;
|
||||
replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::string replace_invalid(std::string_view s)
|
||||
{
|
||||
std::string result;
|
||||
replace_invalid(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline bool starts_with_bom(std::string_view s)
|
||||
{
|
||||
return starts_with_bom(s.begin(), s.end());
|
||||
}
|
||||
|
||||
} // namespace utf8
|
||||
|
||||
#endif // header guard
|
||||
|
||||
274
mlu_370-piper/piper/src/cpp/utf8/unchecked.h
Normal file
274
mlu_370-piper/piper/src/cpp/utf8/unchecked.h
Normal file
@@ -0,0 +1,274 @@
|
||||
// Copyright 2006 Nemanja Trifunovic
|
||||
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
|
||||
#include "core.h"
|
||||
|
||||
namespace utf8
|
||||
{
|
||||
namespace unchecked
|
||||
{
|
||||
// Encodes `cp` as UTF-8 and writes the octets through `result`, without
// checking that `cp` is a valid code point.
// Returns the output iterator advanced past the last octet written.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (cp < 0x80) {
        // one octet
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }

    if (cp < 0x800) {
        // two octets
        *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }

    if (cp < 0x10000) {
        // three octets
        *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
        *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }

    // four octets
    *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
    *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
    return result;
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
|
||||
{
|
||||
while (start != end) {
|
||||
octet_iterator sequence_start = start;
|
||||
internal::utf_error err_code = utf8::internal::validate_next(start, end);
|
||||
switch (err_code) {
|
||||
case internal::UTF8_OK :
|
||||
for (octet_iterator it = sequence_start; it != start; ++it)
|
||||
*out++ = *it;
|
||||
break;
|
||||
case internal::NOT_ENOUGH_ROOM:
|
||||
out = utf8::unchecked::append (replacement, out);
|
||||
start = end;
|
||||
break;
|
||||
case internal::INVALID_LEAD:
|
||||
out = utf8::unchecked::append (replacement, out);
|
||||
++start;
|
||||
break;
|
||||
case internal::INCOMPLETE_SEQUENCE:
|
||||
case internal::OVERLONG_SEQUENCE:
|
||||
case internal::INVALID_CODE_POINT:
|
||||
out = utf8::unchecked::append (replacement, out);
|
||||
++start;
|
||||
// just one replacement mark for the sequence
|
||||
while (start != end && utf8::internal::is_trail(*start))
|
||||
++start;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
|
||||
{
|
||||
static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
|
||||
return utf8::unchecked::replace_invalid(start, end, out, replacement_marker);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
uint32_t next(octet_iterator& it)
|
||||
{
|
||||
uint32_t cp = utf8::internal::mask8(*it);
|
||||
typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
|
||||
switch (length) {
|
||||
case 1:
|
||||
break;
|
||||
case 2:
|
||||
it++;
|
||||
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
|
||||
break;
|
||||
case 3:
|
||||
++it;
|
||||
cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
|
||||
++it;
|
||||
cp += (*it) & 0x3f;
|
||||
break;
|
||||
case 4:
|
||||
++it;
|
||||
cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
|
||||
++it;
|
||||
cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
|
||||
++it;
|
||||
cp += (*it) & 0x3f;
|
||||
break;
|
||||
}
|
||||
++it;
|
||||
return cp;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
uint32_t peek_next(octet_iterator it)
|
||||
{
|
||||
return utf8::unchecked::next(it);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
uint32_t prior(octet_iterator& it)
|
||||
{
|
||||
while (utf8::internal::is_trail(*(--it))) ;
|
||||
octet_iterator temp = it;
|
||||
return utf8::unchecked::next(temp);
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename distance_type>
|
||||
void advance (octet_iterator& it, distance_type n)
|
||||
{
|
||||
const distance_type zero(0);
|
||||
if (n < zero) {
|
||||
// backward
|
||||
for (distance_type i = n; i < zero; ++i)
|
||||
utf8::unchecked::prior(it);
|
||||
} else {
|
||||
// forward
|
||||
for (distance_type i = zero; i < n; ++i)
|
||||
utf8::unchecked::next(it);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
typename std::iterator_traits<octet_iterator>::difference_type
|
||||
distance (octet_iterator first, octet_iterator last)
|
||||
{
|
||||
typename std::iterator_traits<octet_iterator>::difference_type dist;
|
||||
for (dist = 0; first < last; ++dist)
|
||||
utf8::unchecked::next(first);
|
||||
return dist;
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start != end) {
|
||||
uint32_t cp = utf8::internal::mask16(*start++);
|
||||
// Take care of surrogate pairs first
|
||||
if (utf8::internal::is_lead_surrogate(cp)) {
|
||||
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
|
||||
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||
}
|
||||
result = utf8::unchecked::append(cp, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
while (start < end) {
|
||||
uint32_t cp = utf8::unchecked::next(start);
|
||||
if (cp > 0xffff) { //make a surrogate pair
|
||||
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
|
||||
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
|
||||
}
|
||||
else
|
||||
*result++ = static_cast<uint16_t>(cp);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename u32bit_iterator>
|
||||
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start != end)
|
||||
result = utf8::unchecked::append(*(start++), result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename u32bit_iterator>
|
||||
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
|
||||
{
|
||||
while (start < end)
|
||||
(*result++) = utf8::unchecked::next(start);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// The iterator class
|
||||
template <typename octet_iterator>
|
||||
class iterator {
|
||||
octet_iterator it;
|
||||
public:
|
||||
typedef uint32_t value_type;
|
||||
typedef uint32_t* pointer;
|
||||
typedef uint32_t& reference;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef std::bidirectional_iterator_tag iterator_category;
|
||||
iterator () {}
|
||||
explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
|
||||
// the default "big three" are OK
|
||||
octet_iterator base () const { return it; }
|
||||
uint32_t operator * () const
|
||||
{
|
||||
octet_iterator temp = it;
|
||||
return utf8::unchecked::next(temp);
|
||||
}
|
||||
bool operator == (const iterator& rhs) const
|
||||
{
|
||||
return (it == rhs.it);
|
||||
}
|
||||
bool operator != (const iterator& rhs) const
|
||||
{
|
||||
return !(operator == (rhs));
|
||||
}
|
||||
iterator& operator ++ ()
|
||||
{
|
||||
::std::advance(it, utf8::internal::sequence_length(it));
|
||||
return *this;
|
||||
}
|
||||
iterator operator ++ (int)
|
||||
{
|
||||
iterator temp = *this;
|
||||
::std::advance(it, utf8::internal::sequence_length(it));
|
||||
return temp;
|
||||
}
|
||||
iterator& operator -- ()
|
||||
{
|
||||
utf8::unchecked::prior(it);
|
||||
return *this;
|
||||
}
|
||||
iterator operator -- (int)
|
||||
{
|
||||
iterator temp = *this;
|
||||
utf8::unchecked::prior(it);
|
||||
return temp;
|
||||
}
|
||||
}; // class iterator
|
||||
|
||||
} // namespace utf8::unchecked
|
||||
} // namespace utf8
|
||||
|
||||
|
||||
#endif // header guard
|
||||
|
||||
40
mlu_370-piper/piper/src/cpp/wavfile.hpp
Normal file
40
mlu_370-piper/piper/src/cpp/wavfile.hpp
Normal file
@@ -0,0 +1,40 @@
|
||||
#ifndef WAVFILE_H_
|
||||
#define WAVFILE_H_
|
||||
|
||||
#include <iostream>
|
||||
|
||||
// In-memory image of a PCM WAV header. writeWavHeader() writes this struct
// byte-for-byte, so field order and sizes must not change.
struct WavHeader {
    uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
    uint32_t chunkSize;            // set by writeWavHeader: dataSize + header size - 8
    uint8_t WAVE[4] = {'W', 'A', 'V', 'E'};

    // fmt
    uint8_t fmt[4] = {'f', 'm', 't', ' '};
    uint32_t fmtSize = 16;         // bytes
    uint16_t audioFormat = 1;      // PCM
    uint16_t numChannels;          // set by writeWavHeader
    uint32_t sampleRate;           // Hertz
    uint32_t bytesPerSec;          // sampleRate * sampleWidth * channels
    uint16_t blockAlign = 2;       // default: 16-bit mono; overwritten by writeWavHeader
    uint16_t bitsPerSample = 16;   // default: 16-bit; overwritten by writeWavHeader

    // data
    uint8_t data[4] = {'d', 'a', 't', 'a'};
    uint32_t dataSize;             // numSamples * sampleWidth * channels
};

// Write WAV file header only.
//
//   sampleRate  - samples per second (Hertz)
//   sampleWidth - bytes per sample for one channel
//   channels    - number of channels
//   numSamples  - sample count used to compute the data chunk size
//   audioFile   - stream that receives the raw header bytes
//
// Declared inline because this definition lives in a header; without it,
// including the header from two translation units violates the ODR.
inline void writeWavHeader(int sampleRate, int sampleWidth, int channels,
                           uint32_t numSamples, std::ostream &audioFile) {
    WavHeader header;
    header.dataSize = numSamples * sampleWidth * channels;
    header.chunkSize = header.dataSize + sizeof(WavHeader) - 8;
    header.sampleRate = sampleRate;
    header.numChannels = channels;
    header.bytesPerSec = sampleRate * sampleWidth * channels;
    header.blockAlign = sampleWidth * channels;
    // Keep bits-per-sample in step with sampleWidth instead of leaving the
    // 16-bit default, which made the header wrong for any other width.
    header.bitsPerSample = static_cast<uint16_t>(sampleWidth * 8);
    audioFile.write(reinterpret_cast<const char *>(&header), sizeof(header));

} /* writeWavHeader */
|
||||
|
||||
#endif // WAVFILE_H_
|
||||
1
mlu_370-piper/piper/src/python/.dockerignore
Normal file
1
mlu_370-piper/piper/src/python/.dockerignore
Normal file
@@ -0,0 +1 @@
|
||||
*
|
||||
6
mlu_370-piper/piper/src/python/Dockerfile
Normal file
6
mlu_370-piper/piper/src/python/Dockerfile
Normal file
@@ -0,0 +1,6 @@
|
||||
FROM nvcr.io/nvidia/pytorch:22.03-py3
|
||||
|
||||
RUN pip3 install \
|
||||
'pytorch-lightning~=1.7.0'
|
||||
|
||||
ENV NUMBA_CACHE_DIR=.numba_cache
|
||||
0
mlu_370-piper/piper/src/python/README.md
Normal file
0
mlu_370-piper/piper/src/python/README.md
Normal file
13
mlu_370-piper/piper/src/python/build_monotonic_align.sh
Executable file
13
mlu_370-piper/piper/src/python/build_monotonic_align.sh
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
set -eo pipefail
|
||||
|
||||
this_dir="$( cd "$( dirname "$0" )" && pwd )"
|
||||
|
||||
if [ -d "${this_dir}/.venv" ]; then
|
||||
source "${this_dir}/.venv/bin/activate"
|
||||
fi
|
||||
|
||||
cd "${this_dir}/piper_train/vits/monotonic_align"
|
||||
mkdir -p monotonic_align
|
||||
cythonize -i core.pyx
|
||||
mv core*.so monotonic_align/
|
||||
11
mlu_370-piper/piper/src/python/mypy.ini
Normal file
11
mlu_370-piper/piper/src/python/mypy.ini
Normal file
@@ -0,0 +1,11 @@
|
||||
|
||||
[mypy]
|
||||
|
||||
[mypy-setuptools.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-librosa.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-onnxruntime.*]
|
||||
ignore_missing_imports = True
|
||||
11
mlu_370-piper/piper/src/python/piper_train/.gitignore
vendored
Normal file
11
mlu_370-piper/piper/src/python/piper_train/.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
.DS_Store
|
||||
.idea
|
||||
*.log
|
||||
tmp/
|
||||
|
||||
*.py[cod]
|
||||
*.egg
|
||||
build
|
||||
htmlcov
|
||||
|
||||
.venv/
|
||||
6
mlu_370-piper/piper/src/python/piper_train/.isort.cfg
Normal file
6
mlu_370-piper/piper/src/python/piper_train/.isort.cfg
Normal file
@@ -0,0 +1,6 @@
|
||||
[settings]
|
||||
multi_line_output=3
|
||||
include_trailing_comma=True
|
||||
force_grid_wrap=0
|
||||
use_parentheses=True
|
||||
line_length=88
|
||||
1
mlu_370-piper/piper/src/python/piper_train/VERSION
Normal file
1
mlu_370-piper/piper/src/python/piper_train/VERSION
Normal file
@@ -0,0 +1 @@
|
||||
1.0.0
|
||||
147
mlu_370-piper/piper/src/python/piper_train/__main__.py
Normal file
147
mlu_370-piper/piper/src/python/piper_train/__main__.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from pytorch_lightning import Trainer
|
||||
from pytorch_lightning.callbacks import ModelCheckpoint
|
||||
|
||||
from .vits.lightning import VitsModel
|
||||
|
||||
_LOGGER = logging.getLogger(__package__)
|
||||
|
||||
|
||||
def main():
    """Train a VITS model from a pre-processed dataset directory.

    Reads ``config.json`` from ``--dataset-dir`` for the symbol count, speaker
    count and sample rate, builds a ``VitsModel`` whose size depends on
    ``--quality``, optionally copies weights from a single-speaker checkpoint
    into it, and runs ``Trainer.fit``.

    Exits through ``parser.error`` when
    ``--resume_from_single_speaker_checkpoint`` is given for a dataset that
    has a single speaker.
    """
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset-dir", required=True, help="Path to pre-processed dataset directory"
    )
    parser.add_argument(
        "--checkpoint-epochs",
        type=int,
        help="Save checkpoint every N epochs (default: 1)",
    )
    parser.add_argument(
        "--quality",
        default="medium",
        choices=("x-low", "medium", "high"),
        help="Quality/size of model (default: medium)",
    )
    parser.add_argument(
        "--resume_from_single_speaker_checkpoint",
        help="For multi-speaker models only. Converts a single-speaker checkpoint to multi-speaker and resumes training",
    )
    Trainer.add_argparse_args(parser)
    VitsModel.add_model_specific_args(parser)
    parser.add_argument("--seed", type=int, default=1234)
    args = parser.parse_args()
    _LOGGER.debug(args)

    args.dataset_dir = Path(args.dataset_dir)
    if not args.default_root_dir:
        # Trainer output goes next to the dataset unless told otherwise
        args.default_root_dir = args.dataset_dir

    torch.backends.cudnn.benchmark = True
    torch.manual_seed(args.seed)

    config_path = args.dataset_dir / "config.json"
    dataset_path = args.dataset_dir / "dataset.jsonl"

    with open(config_path, "r", encoding="utf-8") as config_file:
        # See preprocess.py for format
        config = json.load(config_file)
        num_symbols = int(config["num_symbols"])
        num_speakers = int(config["num_speakers"])
        sample_rate = int(config["audio"]["sample_rate"])

    # Validate before building the trainer/model. An explicit check is used
    # instead of ``assert`` because asserts are stripped under ``python -O``.
    if args.resume_from_single_speaker_checkpoint and num_speakers <= 1:
        parser.error(
            "--resume_from_single_speaker_checkpoint is only for multi-speaker models. Use --resume_from_checkpoint for single-speaker models."
        )

    trainer = Trainer.from_argparse_args(args)
    if args.checkpoint_epochs is not None:
        # Replaces the trainer's callback list with a single checkpoint callback
        trainer.callbacks = [ModelCheckpoint(every_n_epochs=args.checkpoint_epochs)]
        _LOGGER.debug(
            "Checkpoints will be saved every %s epoch(s)", args.checkpoint_epochs
        )

    # All parsed arguments are forwarded to the model; the quality presets
    # below override selected entries before that happens.
    dict_args = vars(args)
    if args.quality == "x-low":
        dict_args["hidden_channels"] = 96
        dict_args["inter_channels"] = 96
        dict_args["filter_channels"] = 384
    elif args.quality == "high":
        dict_args["resblock"] = "1"
        dict_args["resblock_kernel_sizes"] = (3, 7, 11)
        dict_args["resblock_dilation_sizes"] = (
            (1, 3, 5),
            (1, 3, 5),
            (1, 3, 5),
        )
        dict_args["upsample_rates"] = (8, 8, 2, 2)
        dict_args["upsample_initial_channel"] = 512
        dict_args["upsample_kernel_sizes"] = (16, 16, 4, 4)

    model = VitsModel(
        num_symbols=num_symbols,
        num_speakers=num_speakers,
        sample_rate=sample_rate,
        dataset=[dataset_path],
        **dict_args,
    )

    if args.resume_from_single_speaker_checkpoint:
        # Load single-speaker checkpoint
        _LOGGER.debug(
            "Resuming from single-speaker checkpoint: %s",
            args.resume_from_single_speaker_checkpoint,
        )
        model_single = VitsModel.load_from_checkpoint(
            args.resume_from_single_speaker_checkpoint,
            dataset=None,
        )
        g_dict = model_single.model_g.state_dict()
        for key in list(g_dict.keys()):
            # Remove keys that can't be copied over due to missing speaker embedding
            if (
                key.startswith("dec.cond")
                or key.startswith("dp.cond")
                or ("enc.cond_layer" in key)
            ):
                g_dict.pop(key, None)

        # Copy over the multi-speaker model, excluding keys related to the
        # speaker embedding (which is missing from the single-speaker model).
        load_state_dict(model.model_g, g_dict)
        load_state_dict(model.model_d, model_single.model_d.state_dict())
        _LOGGER.info(
            "Successfully converted single-speaker checkpoint to multi-speaker"
        )

    trainer.fit(model)
|
||||
|
||||
|
||||
def load_state_dict(model, saved_state_dict):
    """Load ``saved_state_dict`` into ``model``, tolerating missing keys.

    For every key in the model's own state dict, the saved value is used when
    present; otherwise the model's current (initialized) value is kept and the
    key is logged at DEBUG level. Keys that exist only in
    ``saved_state_dict`` are ignored.
    """
    merged = {}
    for name, initial_value in model.state_dict().items():
        try:
            # Use saved value
            merged[name] = saved_state_dict[name]
        except KeyError:
            # Use initialized value
            _LOGGER.debug("%s is not in the checkpoint", name)
            merged[name] = initial_value

    model.load_state_dict(merged)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
19
mlu_370-piper/piper/src/python/piper_train/_resources.py
Normal file
19
mlu_370-piper/piper/src/python/piper_train/_resources.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""Shared access to package resources"""
|
||||
import os
|
||||
import typing
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import importlib.resources
|
||||
|
||||
files = importlib.resources.files
|
||||
except (ImportError, AttributeError):
|
||||
# Backport for Python < 3.9
|
||||
import importlib_resources # type: ignore
|
||||
|
||||
files = importlib_resources.files
|
||||
|
||||
_PACKAGE = "piper_train"
|
||||
_DIR = Path(typing.cast(os.PathLike, files(_PACKAGE)))
|
||||
|
||||
__version__ = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
|
||||
57
mlu_370-piper/piper/src/python/piper_train/check_phonemes.py
Normal file
57
mlu_370-piper/piper/src/python/piper_train/check_phonemes.py
Normal file
@@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import sys
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
|
||||
from .phonemize import DEFAULT_PHONEME_ID_MAP
|
||||
|
||||
|
||||
def _describe_phonemes(counts: "Counter[str]") -> dict:
    """Return a JSON-ready mapping of phoneme to its count and Unicode details."""
    return {
        phoneme: {
            "count": count,
            "hex": f"\\u{hex(ord(phoneme))}",
            # Unicode character name; empty string when the character has none.
            # (This field previously repeated the category by mistake.)
            "name": unicodedata.name(phoneme, ""),
            "category": unicodedata.category(phoneme),
        }
        for phoneme, count in counts.most_common()
    }


def main() -> None:
    """Report which phonemes in a JSONL stream are known to the phoneme id map.

    Reads one JSON object per line from stdin (blank lines are skipped),
    counts every entry of its ``"phonemes"`` list, and separately counts the
    ones absent from ``DEFAULT_PHONEME_ID_MAP``. Writes a JSON object with
    ``"used"`` and ``"missing"`` sections to stdout, and prints the number of
    distinct missing phonemes to stderr when there are any.
    """
    used_phonemes: "Counter[str]" = Counter()
    missing_phonemes: "Counter[str]" = Counter()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        for phoneme in utt["phonemes"]:
            used_phonemes[phoneme] += 1

            if phoneme not in DEFAULT_PHONEME_ID_MAP:
                missing_phonemes[phoneme] += 1

    if missing_phonemes:
        print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)

    json.dump(
        {
            "used": _describe_phonemes(used_phonemes),
            "missing": _describe_phonemes(missing_phonemes),
        },
        sys.stdout,
    )
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
_LOGGER = logging.getLogger()
|
||||
|
||||
|
||||
def main() -> None:
    """Check that every ``*.pt`` file in a cache directory loads with torch.

    Each file is loaded on a worker thread. Files that fail to load are logged
    at ERROR level and, with ``--delete``, removed. The number of deleted
    files is printed at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cache-dir",
        required=True,
        help="Path to directory with audio/spectrogram files (*.pt)",
    )
    parser.add_argument(
        "--delete", action="store_true", help="Delete files that fail to load"
    )
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    cache_dir = Path(args.cache_dir)

    def check_file(pt_path: Path) -> bool:
        """Return True when ``pt_path`` failed to load and was deleted."""
        try:
            _LOGGER.debug("Checking %s", pt_path)
            # NOTE(review): torch.load unpickles the file; only point this
            # tool at cache directories you trust.
            torch.load(str(pt_path))
            return False
        except Exception:
            # Any failure to load marks the file as bad (deliberately broad)
            _LOGGER.error(pt_path)

        if not args.delete:
            return False

        try:
            pt_path.unlink()
        except OSError:
            # Report the failure rather than losing it inside the worker
            _LOGGER.exception("Failed to delete %s", pt_path)
            return False

        return True

    # Each worker returns its own result and the total is summed on the main
    # thread, so no counter is mutated concurrently.
    with ThreadPoolExecutor() as executor:
        num_deleted = sum(executor.map(check_file, cache_dir.glob("*.pt")))

    print("Deleted:", num_deleted, "file(s)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from .vits.lightning import VitsModel
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.export_generator")
|
||||
|
||||
|
||||
def main():
    """Export the generator of a training checkpoint with ``torch.save``.

    Loads the checkpoint, switches its generator to eval mode, removes weight
    norm from the decoder, makes ``forward`` an alias of ``infer`` and saves
    the whole module object to the output path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
    parser.add_argument("output", help="Path to output model (.pt)")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    args.checkpoint = Path(args.checkpoint)
    args.output = Path(args.output)
    # Make sure the destination directory exists before saving
    args.output.parent.mkdir(parents=True, exist_ok=True)

    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
    generator = model.model_g

    # Inference only
    generator.eval()
    with torch.no_grad():
        generator.dec.remove_weight_norm()

    # Calling the saved module runs inference directly
    generator.forward = generator.infer

    torch.save(generator, args.output)
    _LOGGER.info("Exported model to %s", args.output)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
109
mlu_370-piper/piper/src/python/piper_train/export_onnx.py
Normal file
109
mlu_370-piper/piper/src/python/piper_train/export_onnx.py
Normal file
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from .vits.lightning import VitsModel
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.export_onnx")
|
||||
|
||||
OPSET_VERSION = 15
|
||||
|
||||
|
||||
def main() -> None:
    """Export the generator of a training checkpoint to a single ONNX file.

    The generator's ``forward`` is replaced by a wrapper around ``infer`` that
    takes the three scale values packed in one tensor, and the model is traced
    with a random 50-symbol dummy input.
    """
    torch.manual_seed(1234)

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
    parser.add_argument("output", help="Path to output model (.onnx)")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    args.checkpoint = Path(args.checkpoint)
    args.output = Path(args.output)
    args.output.parent.mkdir(parents=True, exist_ok=True)

    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
    model_g = model.model_g

    num_symbols = model_g.n_vocab
    num_speakers = model_g.n_speakers

    # Inference only
    model_g.eval()
    with torch.no_grad():
        model_g.dec.remove_weight_norm()

    def infer_forward(text, text_lengths, scales, sid=None):
        # scales is packed as [noise_scale, length_scale, noise_scale_w]
        outputs = model_g.infer(
            text,
            text_lengths,
            noise_scale=scales[0],
            length_scale=scales[1],
            noise_scale_w=scales[2],
            sid=sid,
        )
        # Keep only the first element returned by infer() and add an axis
        return outputs[0].unsqueeze(1)

    model_g.forward = infer_forward

    # Dummy inputs used only for tracing
    dummy_input_length = 50
    sequences = torch.randint(
        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
    )
    sequence_lengths = torch.LongTensor([sequences.size(1)])

    # A speaker id is only supplied for multi-speaker models
    sid: Optional[torch.LongTensor] = (
        torch.LongTensor([0]) if num_speakers > 1 else None
    )

    # Order consumed by infer_forward: noise, length, noise_w
    scales = torch.FloatTensor([0.667, 1.0, 0.8])
    dummy_input = (sequences, sequence_lengths, scales, sid)

    # Export
    torch.onnx.export(
        model=model_g,
        args=dummy_input,
        f=str(args.output),
        verbose=False,
        opset_version=OPSET_VERSION,
        input_names=["input", "input_lengths", "scales", "sid"],
        output_names=["output"],
        dynamic_axes={
            "input": {0: "batch_size", 1: "phonemes"},
            "input_lengths": {0: "batch_size"},
            "output": {0: "batch_size", 1: "time"},
        },
    )

    _LOGGER.info("Exported model to %s", args.output)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,196 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from .vits import commons
|
||||
from .vits.lightning import VitsModel
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.export_onnx")
|
||||
OPSET_VERSION = 15
|
||||
|
||||
|
||||
class VitsEncoder(nn.Module):
    """First half of generator inference, wrapped for separate ONNX export.

    Runs the text encoder and duration predictor of the wrapped generator and
    returns the sampled latent ``z_p``, the output mask ``y_mask`` and the
    speaker embedding ``g`` (``None`` when the generator has one speaker).
    """

    def __init__(self, gen):
        super().__init__()
        self.gen = gen

    def forward(self, x, x_lengths, scales, sid=None):
        gen = self.gen

        # scales is packed as [noise_scale, length_scale, noise_scale_w]
        noise_scale = scales[0]
        length_scale = scales[1]
        noise_scale_w = scales[2]

        x, m_p, logs_p, x_mask = gen.enc_p(x, x_lengths)

        g = None
        if gen.n_speakers > 1:
            assert sid is not None, "Missing speaker id"
            g = gen.emb_g(sid).unsqueeze(-1)  # [b, h, 1]

        # Predict log-durations
        if gen.use_sdp:
            logw = gen.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
        else:
            logw = gen.dp(x, x_mask, g=g)

        # Durations are scaled, masked and rounded up
        w_ceil = torch.ceil(torch.exp(logw) * x_mask * length_scale)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = (
            commons.sequence_mask(y_lengths, y_lengths.max())
            .unsqueeze(1)
            .type_as(x_mask)
        )
        attn_mask = x_mask.unsqueeze(2) * y_mask.unsqueeze(-1)
        attn = commons.generate_path(w_ceil, attn_mask)

        # [b, t', t], [b, t, d] -> [b, d, t']
        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
        # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
            1, 2
        )

        # Sample around the expanded mean with noise scaled by noise_scale
        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        return z_p, y_mask, g
|
||||
|
||||
|
||||
class VitsDecoder(nn.Module):
    """Second half of generator inference, wrapped for separate ONNX export.

    Runs the wrapped generator's flow in reverse on ``z`` and feeds the masked
    result to its decoder.
    """

    def __init__(self, gen):
        super().__init__()
        self.gen = gen

    def forward(self, z, y_mask, g=None):
        gen = self.gen
        latent = gen.flow(z, y_mask, g=g, reverse=True)
        masked_latent = latent * y_mask
        return gen.dec(masked_latent, g=g)
|
||||
|
||||
|
||||
def main() -> None:
    """Export a checkpoint's generator as separate encoder and decoder ONNX files.

    The encoder is exported first; the tensors it returns for the dummy input
    are then used as the example input for the decoder export.
    """
    torch.manual_seed(1234)

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
    parser.add_argument("output_dir", help="Path to output directory")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    # The export helpers read args.output_dir as a Path
    args.checkpoint = Path(args.checkpoint)
    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    generator = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None).model_g

    with torch.no_grad():
        generator.dec.remove_weight_norm()

    _LOGGER.info("Exporting encoder...")
    decoder_input = export_encoder(args, generator)

    _LOGGER.info("Exporting decoder...")
    export_decoder(args, generator, decoder_input)

    _LOGGER.info("Exported model to %s", str(args.output_dir))
|
||||
|
||||
|
||||
def export_encoder(args, model_g):
    """Export the encoder half of ``model_g`` to ``encoder.onnx``.

    Args:
        args: Parsed arguments; ``args.output_dir`` must be a ``Path``.
        model_g: Generator whose encoder-side modules are wrapped in
            ``VitsEncoder``.

    Returns:
        The tuple produced by running the wrapped encoder on the dummy input
        (``z_p``, ``y_mask``, ``g``), for use as example input when exporting
        the decoder.
    """
    model = VitsEncoder(model_g)
    model.eval()

    num_symbols = model_g.n_vocab
    num_speakers = model_g.n_speakers

    # Dummy inputs used only for tracing
    dummy_input_length = 50
    sequences = torch.randint(
        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
    )
    sequence_lengths = torch.LongTensor([sequences.size(1)])

    sid: Optional[torch.LongTensor] = None
    if num_speakers > 1:
        sid = torch.LongTensor([0])

    # Order consumed by VitsEncoder.forward: noise, length, noise_w
    scales = torch.FloatTensor([0.667, 1.0, 0.8])
    dummy_input = (sequences, sequence_lengths, scales, sid)

    output_names = [
        "z",
        "y_mask",
    ]
    if num_speakers > 1:
        # The speaker embedding is only an output for multi-speaker models
        output_names.append("g")

    onnx_path = os.fspath(args.output_dir.joinpath("encoder.onnx"))

    # Export
    torch.onnx.export(
        model=model,
        args=dummy_input,
        f=onnx_path,
        verbose=False,
        opset_version=OPSET_VERSION,
        input_names=["input", "input_lengths", "scales", "sid"],
        output_names=output_names,
        dynamic_axes={
            "input": {0: "batch_size", 1: "phonemes"},
            "input_lengths": {0: "batch_size"},
            # Keys must be real input/output names. The previous "output" key
            # matched nothing in this graph, so the time axes of the encoder
            # outputs were never declared dynamic. Axis 2 matches the "z" and
            # "y_mask" inputs declared by export_decoder.
            "z": {0: "batch_size", 2: "time"},
            "y_mask": {0: "batch_size", 2: "time"},
        },
    )
    _LOGGER.info("Exported encoder to %s", onnx_path)

    return model(*dummy_input)
|
||||
|
||||
|
||||
def export_decoder(args, model_g, decoder_input):
    """Export the decoder half of ``model_g`` to ``decoder.onnx``.

    Args:
        args: Parsed arguments; ``args.output_dir`` must be a ``Path``.
        model_g: Generator whose flow and decoder are wrapped in
            ``VitsDecoder``.
        decoder_input: Example inputs for tracing, as returned by
            ``export_encoder``.
    """
    decoder = VitsDecoder(model_g)
    decoder.eval()

    # The speaker embedding is only an input for multi-speaker models
    input_names = ["z", "y_mask"]
    if model_g.n_speakers > 1:
        input_names.append("g")

    onnx_path = os.fspath(args.output_dir.joinpath("decoder.onnx"))

    time_axes = {
        "z": {0: "batch_size", 2: "time"},
        "y_mask": {0: "batch_size", 2: "time"},
        "output": {0: "batch_size", 1: "time"},
    }

    # Export
    torch.onnx.export(
        model=decoder,
        args=decoder_input,
        f=onnx_path,
        verbose=False,
        opset_version=OPSET_VERSION,
        input_names=input_names,
        output_names=["output"],
        dynamic_axes=time_axes,
    )

    _LOGGER.info("Exported decoder to %s", onnx_path)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from .vits.lightning import VitsModel
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.export_torchscript")
|
||||
|
||||
|
||||
def main():
    """Trace the generator of a checkpoint and save it as TorchScript.

    Command line: ``checkpoint`` (input ``.ckpt``) and ``output`` (file to
    write); ``--debug`` switches logging to DEBUG. The parent directory of
    the output file is created when it does not exist.
    """
    torch.manual_seed(1234)

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
    # This script writes a TorchScript file (torch.jit.save), not ONNX.
    parser.add_argument("output", help="Path to output TorchScript model (.ts)")

    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    args.checkpoint = Path(args.checkpoint)
    args.output = Path(args.output)
    args.output.parent.mkdir(parents=True, exist_ok=True)

    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
    model_g = model.model_g

    num_symbols = model_g.n_vocab

    # Inference only
    model_g.eval()

    with torch.no_grad():
        model_g.dec.remove_weight_norm()

    # Tracing records forward(); point it at the inference entry point.
    model_g.forward = model_g.infer

    # Random example inputs used only to drive the trace.
    dummy_input_length = 50
    sequences = torch.randint(
        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
    )
    sequence_lengths = torch.LongTensor([sequences.size(1)])

    sid = torch.LongTensor([0])

    # NOTE(review): the three float tensors presumably are noise scale,
    # length scale and noise w (same values as the defaults in infer.py);
    # confirm against model_g.infer.
    dummy_input = (
        sequences,
        sequence_lengths,
        sid,
        torch.FloatTensor([0.667]),
        torch.FloatTensor([1.0]),
        torch.FloatTensor([0.8]),
    )

    jitted_model = torch.jit.trace(model_g, dummy_input)
    torch.jit.save(jitted_model, str(args.output))

    _LOGGER.info("Saved TorchScript model to %s", args.output)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
244
mlu_370-piper/piper/src/python/piper_train/filter_utterances.py
Normal file
244
mlu_370-piper/piper/src/python/piper_train/filter_utterances.py
Normal file
@@ -0,0 +1,244 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import statistics
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import asdict, dataclass
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .norm_audio import make_silence_detector, trim_silence
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
|
||||
# Removed from the speaking rate calculation
|
||||
_PUNCTUATION = re.compile(".。,,?¿?؟!!;;::-—")
|
||||
|
||||
|
||||
class ExcludeReason(str, Enum):
    """Reason an utterance is left out of the filtered output.

    Members are also ``str`` instances, so they serialize as plain strings.
    """

    # The audio file does not exist.
    MISSING = "file_missing"
    # The audio file exists but has a size of zero bytes.
    EMPTY = "file_empty"
    # The speaking rate is below the lower bound for the speaker.
    LOW = "rate_low"
    # The speaking rate is above the upper bound for the speaker.
    HIGH = "rate_high"
|
||||
|
||||
|
||||
@dataclass
class Utterance:
    """A single utterance together with its speaking rate.

    ``rate`` is the number of characters of ``text`` (punctuation removed)
    per second of ``duration_sec``. It keeps the value passed in (0.0 by
    default) unless ``duration_sec`` is greater than zero.
    """

    id: str
    text: str
    duration_sec: float
    speaker: str
    exclude_reason: Optional[ExcludeReason] = None
    rate: float = 0.0

    def __post_init__(self):
        if not self.duration_sec > 0:
            return

        # Don't include punctuation in the speaking rate calculation since
        # silence is removed from the measured duration.
        spoken_text = _PUNCTUATION.sub("", self.text)
        self.rate = len(spoken_text) / self.duration_sec
|
||||
|
||||
|
||||
def main():
    """Filter out utterances whose speaking rate is an outlier for the speaker.

    Reads ``|``-delimited metadata rows from stdin: the first column is the
    audio file name, the last column is the text, and the second column is
    the speaker when a row has more than two columns. Every utterance is
    measured, and rows whose rate lies inside
    ``[q1 - scale_lower * iqr, q3 + scale_upper * iqr]`` for their speaker are
    written to stdout. With ``--write-json`` the per-speaker statistics and
    the excluded utterances are written to that path.

    Raises:
        RuntimeError: if the ``ffmpeg`` executable is not on PATH.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--write-json", help="Path to write information about excluded utterances"
    )
    parser.add_argument(
        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
    )
    parser.add_argument("--scale-lower", type=float, default=2.0)
    parser.add_argument("--scale-upper", type=float, default=2.0)
    args = parser.parse_args()

    # Durations are measured by running the "ffmpeg" executable, so that is
    # the program that has to be available.
    if not shutil.which("ffmpeg"):
        raise RuntimeError("ffmpeg not found (is ffmpeg installed?)")

    dataset_dir = Path(args.dataset_dir)
    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    reader = csv.reader(sys.stdin, delimiter="|")

    text_and_audio = []
    for row in reader:
        if not row:
            # Blank line in the metadata
            continue

        filename, text = row[0], row[-1]
        speaker = row[1] if len(row) > 2 else "default"

        # Tried in order: relative to the dataset directory, then inside
        # wav/ or wavs/, each with and without a ".wav" suffix. The last
        # candidate is kept when none exists so the utterance is reported as
        # missing later on.
        candidates = (
            dataset_dir / filename,
            dataset_dir / f"{filename}.wav",
            wav_dir / filename,
            wav_dir / f"{filename}.wav",
        )
        wav_path = next(
            (path for path in candidates if path.exists()), candidates[-1]
        )

        text_and_audio.append((filename, text, wav_path, speaker))

    # speaker -> [utterance]
    utts_by_speaker = defaultdict(list)
    process_utterance = ProcessUtterance()
    with ThreadPoolExecutor() as executor:
        for utt in executor.map(
            lambda item: process_utterance(*item), text_and_audio
        ):
            utts_by_speaker[utt.speaker].append(utt)

    is_multispeaker = len(utts_by_speaker) > 1
    writer = csv.writer(sys.stdout, delimiter="|")

    speaker_details = {}
    for speaker, utts in utts_by_speaker.items():
        # Utterances already excluded (missing or empty file) have no
        # measured rate; keep them out of the statistics.
        rates = [utt.rate for utt in utts if utt.exclude_reason is None]

        if len(rates) >= 2:
            # Exclude rates well outside the 25%/75% quantiles
            rate_qs = statistics.quantiles(rates, n=4)
        elif rates:
            # statistics.quantiles needs at least two data points; a single
            # rate is its own quartile and is therefore always kept.
            rate_qs = [rates[0]] * 3
        else:
            rate_qs = []

        lower = upper = None
        if rate_qs:
            q1 = rate_qs[0]  # 25%
            q3 = rate_qs[-1]  # 75%
            iqr = q3 - q1
            lower = q1 - (args.scale_lower * iqr)
            upper = q3 + (args.scale_upper * iqr)

        speaker_details[speaker] = {
            "min": min(rates) if rates else None,
            "max": max(rates) if rates else None,
            # Key spelling kept as-is for existing readers of this JSON.
            "quanties": rate_qs,
            "lower": lower,
            "upper": upper,
        }

        for utt in utts:
            if utt.exclude_reason is not None:
                # Keep the original reason and never write the row.
                continue

            if utt.rate < lower:
                utt.exclude_reason = ExcludeReason.LOW
            elif utt.rate > upper:
                utt.exclude_reason = ExcludeReason.HIGH
            elif is_multispeaker:
                writer.writerow((utt.id, utt.speaker, utt.text))
            else:
                writer.writerow((utt.id, utt.text))

    if args.write_json:
        speaker_excluded = {
            speaker: [
                asdict(utt)
                for utt in utts_by_speaker[speaker]
                if utt.exclude_reason is not None
            ]
            for speaker in speaker_details
        }

        with open(args.write_json, "w", encoding="utf-8") as json_file:
            json.dump(
                {
                    speaker: {
                        "details": speaker_details[speaker],
                        "num_utterances": len(utts_by_speaker[speaker]),
                        "num_excluded": len(speaker_excluded[speaker]),
                        "excluded": speaker_excluded[speaker],
                    }
                    for speaker in speaker_details
                },
                json_file,
                indent=4,
                ensure_ascii=False,
            )
|
||||
|
||||
|
||||
class ProcessUtterance:
    """Callable that measures one utterance and returns an ``Utterance``.

    The silence detector is created on first use in each thread and stored in
    thread-local data, so every worker thread gets its own detector.
    """

    def __init__(self):
        self.thread_data = threading.local()

    def __call__(
        self, utt_id: str, text: str, wav_path: Path, speaker: str
    ) -> Utterance:
        """Build the ``Utterance`` for one metadata row.

        A file that does not exist or has a size of zero bytes yields an
        utterance with duration 0.0 and the matching exclude reason; the
        audio is not decoded in those cases.
        """
        if not wav_path.exists():
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.MISSING,
            )

        if wav_path.stat().st_size == 0:
            return Utterance(
                utt_id,
                text,
                0.0,
                speaker,
                exclude_reason=ExcludeReason.EMPTY,
            )

        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)

    def get_duration(self, audio_path: Path) -> float:
        """Return the speaking duration of ``audio_path`` in seconds.

        The file is decoded by ffmpeg to 16 kHz mono signed 16-bit PCM and
        passed to ``trim_silence``. Returns 0.0 when ffmpeg produces no
        samples.

        Raises:
            subprocess.CalledProcessError: if ffmpeg exits with an error.
        """
        if not hasattr(self.thread_data, "detector"):
            self.thread_data.detector = make_silence_detector()

        vad_sample_rate = 16000
        audio_16khz_bytes = subprocess.check_output(
            [
                "ffmpeg",
                "-i",
                str(audio_path),
                "-f",
                "s16le",
                "-acodec",
                "pcm_s16le",
                "-ac",
                "1",
                "-ar",
                str(vad_sample_rate),
                "pipe:",
            ],
            stderr=subprocess.DEVNULL,
        )

        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
            np.float32
        )
        if audio_16khz.size == 0:
            # Nothing was decoded, so there is nothing to measure.
            return 0.0

        # Normalize by the largest absolute sample. Silent audio (peak 0) is
        # left untouched instead of being divided by zero.
        peak = float(np.max(np.abs(audio_16khz)))
        if peak > 0:
            audio_16khz /= peak

        # Get speaking duration
        offset_sec, duration_sec = trim_silence(
            audio_16khz,
            self.thread_data.detector,
            threshold=0.8,
            samples_per_chunk=480,
            sample_rate=vad_sample_rate,
            keep_chunks_before=2,
            keep_chunks_after=2,
        )

        if duration_sec is None:
            # Speech goes to end of audio
            duration_sec = (len(audio_16khz) / float(vad_sample_rate)) - offset_sec

        return duration_sec
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
84
mlu_370-piper/piper/src/python/piper_train/infer.py
Normal file
84
mlu_370-piper/piper/src/python/piper_train/infer.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from .vits.lightning import VitsModel
|
||||
from .vits.utils import audio_float_to_int16
|
||||
from .vits.wavfile import write as write_wav
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.infer")
|
||||
|
||||
|
||||
def main():
    """Synthesize WAV files from phoneme ids using a training checkpoint.

    Every non-empty stdin line is a JSON object with a "phoneme_ids" list and
    an optional "speaker_id". The audio for input line ``i`` (0-based, blank
    lines included in the count) is written to ``<output-dir>/<i>.wav``.
    """
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(prog="piper_train.infer")
    parser.add_argument(
        "--checkpoint", required=True, help="Path to model checkpoint (.ckpt)"
    )
    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
    parser.add_argument("--sample-rate", type=int, default=22050)
    # Values passed to the model as the "scales" argument
    parser.add_argument("--noise-scale", type=float, default=0.667)
    parser.add_argument("--length-scale", type=float, default=1.0)
    parser.add_argument("--noise-w", type=float, default=0.8)
    args = parser.parse_args()

    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)

    # Inference only
    model.eval()

    with torch.no_grad():
        model.model_g.dec.remove_weight_norm()

    for line_index, raw_line in enumerate(sys.stdin):
        json_text = raw_line.strip()
        if not json_text:
            continue

        utt = json.loads(json_text)
        utt_id = str(line_index)
        phoneme_ids = utt["phoneme_ids"]
        speaker_id = utt.get("speaker_id")

        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
        text_lengths = torch.LongTensor([len(phoneme_ids)])
        scales = [args.noise_scale, args.length_scale, args.noise_w]
        if speaker_id is None:
            sid = None
        else:
            sid = torch.LongTensor([speaker_id])

        # The timed span covers the model call and the int16 conversion.
        started = time.perf_counter()
        audio = model(text, text_lengths, scales, sid=sid).detach().numpy()
        audio = audio_float_to_int16(audio)
        finished = time.perf_counter()

        audio_duration_sec = audio.shape[-1] / args.sample_rate
        infer_sec = finished - started
        if audio_duration_sec > 0:
            real_time_factor = infer_sec / audio_duration_sec
        else:
            real_time_factor = 0.0

        _LOGGER.debug(
            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
            line_index + 1,
            real_time_factor,
            infer_sec,
            audio_duration_sec,
        )

        output_path = args.output_dir / f"{utt_id}.wav"
        write_wav(str(output_path), args.sample_rate, audio)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from .vits.utils import audio_float_to_int16
|
||||
from .vits.wavfile import write as write_wav
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.infer_generator")
|
||||
|
||||
|
||||
def main():
    """Synthesize WAV files from phoneme ids using a saved generator (.pt).

    Every non-empty stdin line is a JSON object with a "phoneme_ids" list and
    an optional "speaker_id". The audio for input line ``i`` (0-based, blank
    lines included in the count) is written to ``<output-dir>/<i>.wav``.
    """
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(prog="piper_train.infer_generator")
    parser.add_argument("--model", required=True, help="Path to generator (.pt)")
    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
    parser.add_argument("--sample-rate", type=int, default=22050)
    args = parser.parse_args()

    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE: torch.load unpickles the file; only use models from a trusted source.
    model = torch.load(args.model)

    # Inference only
    model.eval()

    for line_index, raw_line in enumerate(sys.stdin):
        json_text = raw_line.strip()
        if not json_text:
            continue

        utt = json.loads(json_text)
        utt_id = str(line_index)
        phoneme_ids = utt["phoneme_ids"]
        speaker_id = utt.get("speaker_id")

        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
        text_lengths = torch.LongTensor([len(phoneme_ids)])
        if speaker_id is None:
            sid = None
        else:
            sid = torch.LongTensor([speaker_id])

        # The timed span covers the model call and the int16 conversion.
        started = time.perf_counter()
        # Only text, lengths and speaker id are passed; no scale tensors.
        outputs = model(text, text_lengths, sid)
        audio = outputs[0].detach().numpy()
        audio = audio_float_to_int16(audio)
        finished = time.perf_counter()

        audio_duration_sec = audio.shape[-1] / args.sample_rate
        infer_sec = finished - started
        if audio_duration_sec > 0:
            real_time_factor = infer_sec / audio_duration_sec
        else:
            real_time_factor = 0.0

        _LOGGER.debug(
            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
            line_index + 1,
            real_time_factor,
            infer_sec,
            audio_duration_sec,
        )

        output_path = args.output_dir / f"{utt_id}.wav"
        write_wav(str(output_path), args.sample_rate, audio)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
200
mlu_370-piper/piper/src/python/piper_train/infer_onnx.py
Normal file
200
mlu_370-piper/piper/src/python/piper_train/infer_onnx.py
Normal file
@@ -0,0 +1,200 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
|
||||
from .vits.utils import audio_float_to_int16
|
||||
from .vits.wavfile import write as write_wav
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.infer_onnx")
|
||||
|
||||
|
||||
def main():
    """Synthesize WAV files from phoneme ids using an ONNX model.

    Every non-empty stdin line is a JSON object with a "phoneme_ids" list and
    an optional "speaker_id". The audio for input line ``i`` (0-based, blank
    lines included in the count) is written to ``<output-dir>/<i>.wav``.
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser(prog="piper_train.infer_onnx")
    parser.add_argument("--model", required=True, help="Path to model (.onnx)")
    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
    parser.add_argument("--sample-rate", type=int, default=22050)
    parser.add_argument("--noise-scale", type=float, default=0.667)
    parser.add_argument("--noise-scale-w", type=float, default=0.8)
    parser.add_argument("--length-scale", type=float, default=1.0)
    args = parser.parse_args()

    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    sess_options = onnxruntime.SessionOptions()
    _LOGGER.debug("Loading model from %s", args.model)
    model = onnxruntime.InferenceSession(str(args.model), sess_options=sess_options)
    _LOGGER.info("Loaded model from %s", args.model)

    # NOTE: the denoise() helper in this module is not applied to the output.

    for i, line in enumerate(sys.stdin):
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        utt_id = str(i)
        phoneme_ids = utt["phoneme_ids"]
        speaker_id = utt.get("speaker_id")

        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths = np.array([text.shape[1]], dtype=np.int64)
        scales = np.array(
            [args.noise_scale, args.length_scale, args.noise_scale_w],
            dtype=np.float32,
        )

        feed = {
            "input": text,
            "input_lengths": text_lengths,
            "scales": scales,
        }
        # Only feed "sid" when a speaker id was given, rather than passing
        # None as a tensor value.
        if speaker_id is not None:
            feed["sid"] = np.array([speaker_id], dtype=np.int64)

        # The timed span covers the model run and the int16 conversion.
        start_time = time.perf_counter()
        # Drop the two leading axes of the first output (both must be size 1).
        audio = model.run(None, feed)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio)
        end_time = time.perf_counter()

        audio_duration_sec = audio.shape[-1] / args.sample_rate
        infer_sec = end_time - start_time
        real_time_factor = (
            infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
        )

        _LOGGER.debug(
            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
            i + 1,
            real_time_factor,
            infer_sec,
            audio_duration_sec,
        )

        output_path = args.output_dir / f"{utt_id}.wav"
        write_wav(str(output_path), args.sample_rate, audio)
|
||||
|
||||
|
||||
def denoise(
    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
) -> np.ndarray:
    """Subtract a scaled bias spectrum from the magnitude spectrum of ``audio``.

    ``bias_spec`` is repeated along its last axis and cut to the number of
    frames of the audio spectrum. The subtracted magnitudes are clipped at
    zero and recombined with the original phase.
    """
    spec, angles = transform(audio)

    bias_frames = bias_spec.shape[-1]
    audio_frames = spec.shape[-1]
    repeats = max(1, math.ceil(audio_frames / bias_frames))
    bias_stretched = np.repeat(bias_spec, repeats, axis=-1)
    bias_stretched = bias_stretched[..., :audio_frames]

    reduced = spec - (bias_stretched * denoiser_strength)
    reduced = np.clip(reduced, a_min=0.0, a_max=None)

    return inverse(reduced, angles)
|
||||
|
||||
|
||||
def stft(x, fft_size, hopsamp):
    """Compute and return the STFT of the supplied time domain signal x.

    Args:
        x (1-dim Numpy array): A time domain signal.
        fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
        hopsamp (int): The hop size, in samples.

    Returns:
        The STFT. The rows are the time slices and columns are the frequency bins.
    """
    window = np.hanning(fft_size)
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)

    # Frame starts stop before len(x) - fft_size (exclusive bound).
    frames = []
    for start in range(0, len(x) - fft_size, hopsamp):
        segment = x[start : start + fft_size]
        frames.append(np.fft.rfft(window * segment))

    return np.array(frames)
|
||||
|
||||
|
||||
def istft(X, fft_size, hopsamp):
    """Invert a STFT into a time domain signal.

    Args:
        X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
        fft_size (int): FFT size; each inverse-transformed row must have this length.
        hopsamp (int): The hop size, in samples.

    Returns:
        The inverse STFT, of length ``rows * hopsamp + fft_size``.
    """
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)
    window = np.hanning(fft_size)

    n_slices = X.shape[0]
    total_samples = int(n_slices * hopsamp + fft_size)
    signal = np.zeros(total_samples)

    # Overlap-add each windowed frame at its hop position.
    frame_starts = range(0, total_samples - fft_size, hopsamp)
    for n, start in enumerate(frame_starts):
        frame = np.real(np.fft.irfft(X[n]))
        signal[start : start + fft_size] += window * frame

    return signal
|
||||
|
||||
|
||||
def inverse(magnitude, phase):
    """Rebuild time domain signals from magnitude and phase spectrograms.

    Both inputs are 3-dimensional (batch, frequency bins, frames). Each batch
    item is inverted with ``istft`` using an FFT size of 1024 and a hop of
    256; the results are stacked along axis 0.
    """
    real = magnitude * np.cos(phase)
    imag = magnitude * np.sin(phase)
    stacked = np.concatenate([real, imag], axis=1)

    n_b, n_f, n_t = stacked.shape  # pylint: disable=unpacking-non-sequence
    half = n_f // 2

    # First half of axis 1 holds the real parts, second half the imaginary.
    spectrum = np.empty([n_b, half, n_t], dtype=np.complex64)
    spectrum.real = stacked[:, :half]
    spectrum.imag = stacked[:, half:]

    signals = [
        istft(item.T, fft_size=1024, hopsamp=256)[None, :] for item in spectrum
    ]

    return np.concatenate(signals, 0)
|
||||
|
||||
|
||||
def transform(input_data):
    """Return the magnitude and phase spectrograms of a batch of signals.

    Each item of ``input_data`` is passed to ``stft`` with an FFT size of
    1024 and a hop of 256. Both results have the shape
    (batch, frequency bins, frames).
    """
    real_part = []
    imag_part = []
    for y in input_data:
        # stft returns (frames, bins); transpose to (bins, frames).
        y_ = stft(y, fft_size=1024, hopsamp=256).T
        real_part.append(y_.real[None, :, :])  # pylint: disable=unsubscriptable-object
        imag_part.append(y_.imag[None, :, :])  # pylint: disable=unsubscriptable-object
    real_part = np.concatenate(real_part, 0)
    imag_part = np.concatenate(imag_part, 0)

    magnitude = np.sqrt(real_part**2 + imag_part**2)
    # Pass the arrays themselves; ndarray.data is a raw buffer object.
    phase = np.arctan2(imag_part, real_part)

    return magnitude, phase
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,295 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
|
||||
from .vits.utils import audio_float_to_int16
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.infer_onnx")
|
||||
|
||||
|
||||
class SpeechStreamer:
    """
    Stream speech in real time.

    Args:
        encoder_path: path to encoder ONNX model
        decoder_path: path to decoder ONNX model
        sample_rate: output sample rate
        chunk_size: number of mel frames to decode in each step (a frame is
            counted as 256 output samples, so a chunk lasts
            chunk_size * 256 / sample_rate seconds)
        chunk_padding: number of mel frames to be concatenated to the start
            and end of the current chunk to reduce decoding artifacts
    """

    # Number of output samples counted per frame.
    _SAMPLES_PER_FRAME = 256

    def __init__(
        self,
        encoder_path,
        decoder_path,
        sample_rate,
        chunk_size=45,
        chunk_padding=10,
    ):
        sess_options = onnxruntime.SessionOptions()
        _LOGGER.debug("Loading encoder model from %s", encoder_path)
        self.encoder = onnxruntime.InferenceSession(
            encoder_path, sess_options=sess_options
        )
        _LOGGER.debug("Loading decoder model from %s", decoder_path)
        self.decoder = onnxruntime.InferenceSession(
            decoder_path, sess_options=sess_options
        )

        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.chunk_padding = chunk_padding

    def encoder_infer(self, enc_input):
        """Run the encoder on ``enc_input`` and return all of its outputs.

        Logs the inference time in milliseconds and the real-time factor.
        """
        started = time.perf_counter()
        enc_output = self.encoder.run(None, enc_input)
        elapsed = time.perf_counter() - started
        _LOGGER.debug("Encoder inference %s", round(elapsed * 1000))

        wav_length = enc_output[0].shape[2] * self._SAMPLES_PER_FRAME
        if wav_length > 0:
            # No real-time factor for empty output (would divide by zero).
            enc_rtf = round(elapsed / (wav_length / self.sample_rate), 2)
            _LOGGER.debug("Encoder RTF %s", enc_rtf)

        return enc_output

    def decoder_infer(self, z, y_mask, g=None):
        """Decode ``z`` / ``y_mask`` (and ``g`` when given) to audio samples.

        Returns the first decoder output with size-1 axes removed.
        """
        dec_input = {"z": z, "y_mask": y_mask}
        # g is an array; its truth value is ambiguous, so compare with None.
        if g is not None:
            dec_input["g"] = g

        started = time.perf_counter()
        audio = self.decoder.run(None, dec_input)[0].squeeze()
        elapsed = time.perf_counter() - started
        _LOGGER.debug("Decoder inference %s", round(elapsed * 1000))

        if audio.size > 0:
            dec_rtf = round(elapsed / (audio.size / self.sample_rate), 2)
            _LOGGER.debug("Decoder RTF %s", dec_rtf)

        return audio

    def chunk(self, enc_output):
        """Yield decoded audio for ``enc_output`` piece by piece.

        ``enc_output`` is ``(z, y_mask, *extra)``; the extra items are passed
        on to ``decoder_infer`` unchanged. Input that is not longer than
        ``chunk_size + 2 * chunk_padding`` frames is decoded in one go and
        yielded as a single item. Otherwise every chunk is decoded together
        with up to ``chunk_padding`` neighbouring frames on each side, and
        the audio belonging to those extra frames is cut off again.
        """
        z, y_mask, *dec_args = enc_output
        n_frames = z.shape[2]
        if n_frames <= (self.chunk_size + (2 * self.chunk_padding)):
            # Too short to stream. This is a generator, so the audio has to
            # be yielded; a returned value would never reach the caller.
            yield self.decoder_infer(z, y_mask, *dec_args)
            return

        split_at = [
            i * self.chunk_size for i in range(1, math.ceil(n_frames / self.chunk_size))
        ]
        chunks = list(
            zip(
                np.split(z, split_at, axis=2),
                np.split(y_mask, split_at, axis=2),
            )
        )

        padding = self.chunk_padding
        for idx, (z_chunk, y_mask_chunk) in enumerate(chunks):
            # Recomputed for every chunk so that the last chunk is not cut
            # by the end padding of the chunk before it.
            start_trim = 0
            end_trim = 0

            if padding > 0 and idx > 0:
                prev_z, prev_y_mask = chunks[idx - 1]
                start_zpad = prev_z[:, :, -padding:]
                start_ypad = prev_y_mask[:, :, -padding:]
                z_chunk = np.concatenate([start_zpad, z_chunk], axis=2)
                y_mask_chunk = np.concatenate([start_ypad, y_mask_chunk], axis=2)
                start_trim = start_zpad.shape[2] * self._SAMPLES_PER_FRAME

            if padding > 0 and (idx + 1) < len(chunks):
                next_z, next_y_mask = chunks[idx + 1]
                end_zpad = next_z[:, :, :padding]
                end_ypad = next_y_mask[:, :, :padding]
                z_chunk = np.concatenate([z_chunk, end_zpad], axis=2)
                y_mask_chunk = np.concatenate([y_mask_chunk, end_ypad], axis=2)
                end_trim = end_zpad.shape[2] * self._SAMPLES_PER_FRAME

            audio = self.decoder_infer(z_chunk, y_mask_chunk, *dec_args)
            # An explicit end index also works when end_trim is 0.
            yield audio[start_trim : len(audio) - end_trim]

    def stream(self, encoder_input):
        """Yield the synthesized audio as int16 byte strings, chunk by chunk.

        The time until the first non-empty chunk is logged as latency (ms).
        """
        start_time = time.perf_counter()
        has_shown_latency = False
        _LOGGER.debug("Starting synthesis")
        enc_output = self.encoder_infer(encoder_input)
        for wav in self.chunk(enc_output):
            if len(wav) == 0:
                continue
            if not has_shown_latency:
                latency_ms = round((time.perf_counter() - start_time) * 1000)
                _LOGGER.debug("Latency %s", latency_ms)
                has_shown_latency = True
            audio = audio_float_to_int16(wav)
            yield audio.tobytes()
        _LOGGER.debug("Synthesis done!")
|
||||
|
||||
|
||||
def main():
    """Stream synthesized audio for phoneme ids read from stdin.

    Every non-empty stdin line is a JSON object with a "phoneme_ids" list and
    an optional "speaker_id". The int16 audio bytes of every chunk are
    written to stdout and flushed as soon as the chunk is available.
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser(prog="piper_train.infer_onnx_streaming")
    parser.add_argument(
        "--encoder", required=True, help="Path to encoder model (.onnx)"
    )
    parser.add_argument(
        "--decoder", required=True, help="Path to decoder model (.onnx)"
    )
    parser.add_argument("--sample-rate", type=int, default=22050)
    parser.add_argument("--noise-scale", type=float, default=0.667)
    parser.add_argument("--noise-scale-w", type=float, default=0.8)
    parser.add_argument("--length-scale", type=float, default=1.0)
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=45,
        help="Number of mel frames to decode at each step"
    )
    parser.add_argument(
        "--chunk-padding",
        type=int,
        default=5,
        help="Number of mel frames to add to the start and end of the current chunk to reduce decoding artifacts"
    )

    args = parser.parse_args()

    streamer = SpeechStreamer(
        encoder_path=os.fspath(args.encoder),
        decoder_path=os.fspath(args.decoder),
        sample_rate=args.sample_rate,
        chunk_size=args.chunk_size,
        chunk_padding=args.chunk_padding,
    )

    output_buffer = sys.stdout.buffer

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        utt = json.loads(line)
        phoneme_ids = utt["phoneme_ids"]
        speaker_id = utt.get("speaker_id")

        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths = np.array([text.shape[1]], dtype=np.int64)
        scales = np.array(
            [args.noise_scale, args.length_scale, args.noise_scale_w],
            dtype=np.float32,
        )

        encoder_input = {
            "input": text,
            "input_lengths": text_lengths,
            "scales": scales,
        }
        # Only feed "sid" when a speaker id was given, rather than passing
        # None as a tensor value.
        if speaker_id is not None:
            encoder_input["sid"] = np.array([speaker_id], dtype=np.int64)

        for wav_chunk in streamer.stream(encoder_input):
            output_buffer.write(wav_chunk)
            # Flush per chunk so the consumer receives audio immediately.
            output_buffer.flush()
|
||||
|
||||
|
||||
def denoise(
    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
) -> np.ndarray:
    """Subtract a scaled bias spectrum from the magnitude spectrum of ``audio``.

    ``bias_spec`` is repeated along its last axis and cut to the number of
    frames of the audio spectrum. The subtracted magnitudes are clipped at
    zero and recombined with the original phase.
    """
    spec, angles = transform(audio)

    bias_frames = bias_spec.shape[-1]
    audio_frames = spec.shape[-1]
    repeats = max(1, math.ceil(audio_frames / bias_frames))
    bias_stretched = np.repeat(bias_spec, repeats, axis=-1)
    bias_stretched = bias_stretched[..., :audio_frames]

    reduced = spec - (bias_stretched * denoiser_strength)
    reduced = np.clip(reduced, a_min=0.0, a_max=None)

    return inverse(reduced, angles)
|
||||
|
||||
|
||||
def stft(x, fft_size, hopsamp):
    """Compute and return the STFT of the supplied time domain signal x.

    Args:
        x (1-dim Numpy array): A time domain signal.
        fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
        hopsamp (int): The hop size, in samples.

    Returns:
        The STFT. The rows are the time slices and columns are the frequency bins.
    """
    window = np.hanning(fft_size)
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)

    # Frame starts stop before len(x) - fft_size (exclusive bound).
    frames = []
    for start in range(0, len(x) - fft_size, hopsamp):
        segment = x[start : start + fft_size]
        frames.append(np.fft.rfft(window * segment))

    return np.array(frames)
|
||||
|
||||
|
||||
def istft(X, fft_size, hopsamp):
    """Invert a STFT into a time domain signal.

    Args:
        X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
        fft_size (int): FFT size; each inverse-transformed row must have this length.
        hopsamp (int): The hop size, in samples.

    Returns:
        The inverse STFT, of length ``rows * hopsamp + fft_size``.
    """
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)
    window = np.hanning(fft_size)

    n_slices = X.shape[0]
    total_samples = int(n_slices * hopsamp + fft_size)
    signal = np.zeros(total_samples)

    # Overlap-add each windowed frame at its hop position.
    frame_starts = range(0, total_samples - fft_size, hopsamp)
    for n, start in enumerate(frame_starts):
        frame = np.real(np.fft.irfft(X[n]))
        signal[start : start + fft_size] += window * frame

    return signal
|
||||
|
||||
|
||||
def inverse(magnitude, phase):
    """Rebuild time domain signals from magnitude and phase spectrograms.

    Both inputs are 3-dimensional (batch, frequency bins, frames). Each batch
    item is inverted with ``istft`` using an FFT size of 1024 and a hop of
    256; the results are stacked along axis 0.
    """
    real = magnitude * np.cos(phase)
    imag = magnitude * np.sin(phase)
    stacked = np.concatenate([real, imag], axis=1)

    n_b, n_f, n_t = stacked.shape  # pylint: disable=unpacking-non-sequence
    half = n_f // 2

    # First half of axis 1 holds the real parts, second half the imaginary.
    spectrum = np.empty([n_b, half, n_t], dtype=np.complex64)
    spectrum.real = stacked[:, :half]
    spectrum.imag = stacked[:, half:]

    signals = [
        istft(item.T, fft_size=1024, hopsamp=256)[None, :] for item in spectrum
    ]

    return np.concatenate(signals, 0)
|
||||
|
||||
|
||||
def transform(input_data):
    """Return the magnitude and phase spectrograms of a batch of signals.

    Each item of ``input_data`` is passed to ``stft`` with an FFT size of
    1024 and a hop of 256. Both results have the shape
    (batch, frequency bins, frames).
    """
    real_part = []
    imag_part = []
    for y in input_data:
        # stft returns (frames, bins); transpose to (bins, frames).
        y_ = stft(y, fft_size=1024, hopsamp=256).T
        real_part.append(y_.real[None, :, :])  # pylint: disable=unsubscriptable-object
        imag_part.append(y_.imag[None, :, :])  # pylint: disable=unsubscriptable-object
    real_part = np.concatenate(real_part, 0)
    imag_part = np.concatenate(imag_part, 0)

    magnitude = np.sqrt(real_part**2 + imag_part**2)
    # Pass the arrays themselves; ndarray.data is a raw buffer object.
    phase = np.arctan2(imag_part, real_part)

    return magnitude, phase
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
85
mlu_370-piper/piper/src/python/piper_train/infer_torchscript.py
Executable file
85
mlu_370-piper/piper/src/python/piper_train/infer_torchscript.py
Executable file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from .vits.utils import audio_float_to_int16
|
||||
from .vits.wavfile import write as write_wav
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.infer_torchscript")
|
||||
|
||||
|
||||
def main():
    """Synthesize one WAV file per JSON line read from standard input.

    Every non-blank stdin line must be a JSON object with ``phoneme_ids``
    and, optionally, ``speaker_id``. The output file is named after the
    zero-based index of the stdin line (blank lines still consume an index).
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser(prog="piper_train.infer_torchscript")
    parser.add_argument(
        "--model", required=True, help="Path to torchscript checkpoint (.ts)"
    )
    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
    parser.add_argument("--sample-rate", type=int, default=22050)
    args = parser.parse_args()

    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Load the scripted model and switch it to inference mode
    model = torch.jit.load(args.model)
    model.eval()

    for line_index, raw_line in enumerate(sys.stdin):
        stripped = raw_line.strip()
        if not stripped:
            continue

        utt = json.loads(stripped)
        phoneme_ids = utt["phoneme_ids"]
        speaker_id = utt.get("speaker_id")

        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
        text_lengths = torch.LongTensor([len(phoneme_ids)])
        sid = None
        if speaker_id is not None:
            sid = torch.LongTensor([speaker_id])

        # The timed region covers the model call and the int16 conversion.
        start_time = time.perf_counter()
        model_outputs = model(
            text,
            text_lengths,
            sid,
            torch.FloatTensor([0.667]),
            torch.FloatTensor([1.0]),
            torch.FloatTensor([0.8]),
        )
        audio = audio_float_to_int16(model_outputs[0].detach().numpy())
        end_time = time.perf_counter()

        audio_duration_sec = audio.shape[-1] / args.sample_rate
        infer_sec = end_time - start_time
        if audio_duration_sec > 0:
            real_time_factor = infer_sec / audio_duration_sec
        else:
            real_time_factor = 0.0

        _LOGGER.debug(
            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
            line_index + 1,
            real_time_factor,
            infer_sec,
            audio_duration_sec,
        )

        output_path = args.output_dir / f"{line_index}.wav"
        write_wav(str(output_path), args.sample_rate, audio)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,92 @@
|
||||
from hashlib import sha256
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import librosa
|
||||
import torch
|
||||
|
||||
from piper_train.vits.mel_processing import spectrogram_torch
|
||||
|
||||
from .trim import trim_silence
|
||||
from .vad import SileroVoiceActivityDetector
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
|
||||
|
||||
def make_silence_detector() -> SileroVoiceActivityDetector:
    """Build a voice activity detector from ``models/silero_vad.onnx`` next to this module."""
    return SileroVoiceActivityDetector(_DIR / "models" / "silero_vad.onnx")
|
||||
|
||||
|
||||
def cache_norm_audio(
    audio_path: Union[str, Path],
    cache_dir: Union[str, Path],
    detector: SileroVoiceActivityDetector,
    sample_rate: int,
    silence_threshold: float = 0.2,
    silence_samples_per_chunk: int = 480,
    silence_keep_chunks_before: int = 2,
    silence_keep_chunks_after: int = 2,
    filter_length: int = 1024,
    window_length: int = 1024,
    hop_length: int = 256,
    ignore_cache: bool = False,
) -> Tuple[Path, Path]:
    """Write silence-trimmed audio and its spectrogram into the cache.

    Both cache files are named after the SHA256 of the absolute audio path.
    A file is (re)computed when it is missing or when ``ignore_cache`` is set.

    Args:
        audio_path: Source audio file.
        cache_dir: Directory that receives the ``.pt`` / ``.spec.pt`` files.
        detector: Voice activity detector handed to ``trim_silence``.
        sample_rate: Sample rate used to load the audio that is cached.
        silence_threshold: Speech threshold forwarded to ``trim_silence``.
        silence_samples_per_chunk: Chunk size forwarded to ``trim_silence``.
        silence_keep_chunks_before: Leading padding forwarded to ``trim_silence``.
        silence_keep_chunks_after: Trailing padding forwarded to ``trim_silence``.
        filter_length: ``n_fft`` passed to ``spectrogram_torch``.
        window_length: ``win_size`` passed to ``spectrogram_torch``.
        hop_length: ``hop_size`` passed to ``spectrogram_torch``.
        ignore_cache: Recompute even if cached files already exist.

    Returns:
        Paths of the cached audio tensor and the cached spectrogram tensor.
    """
    source_path = Path(audio_path).absolute()
    cache_root = Path(cache_dir)

    # Both cache files share a key derived from the absolute source path
    cache_key = sha256(str(source_path).encode()).hexdigest()
    audio_norm_path = cache_root / f"{cache_key}.pt"
    audio_spec_path = cache_root / f"{cache_key}.spec.pt"

    audio_norm_tensor: Optional[torch.FloatTensor] = None
    if ignore_cache or (not audio_norm_path.exists()):
        # The VAD model runs on 16 kHz audio: find the span to keep at that
        # rate, then load only that span at the target sample rate.
        vad_sample_rate = 16000
        vad_audio, _ = librosa.load(path=source_path, sr=vad_sample_rate)
        offset_sec, duration_sec = trim_silence(
            vad_audio,
            detector,
            threshold=silence_threshold,
            samples_per_chunk=silence_samples_per_chunk,
            sample_rate=vad_sample_rate,
            keep_chunks_before=silence_keep_chunks_before,
            keep_chunks_after=silence_keep_chunks_after,
        )

        # NOTE: librosa already returns samples in [-1, 1]
        trimmed_audio, _ = librosa.load(
            path=source_path,
            sr=sample_rate,
            offset=offset_sec,
            duration=duration_sec,
        )

        audio_norm_tensor = torch.FloatTensor(trimmed_audio).unsqueeze(0)
        torch.save(audio_norm_tensor, audio_norm_path)

    if (not ignore_cache) and audio_spec_path.exists():
        # Spectrogram is already cached
        return audio_norm_path, audio_spec_path

    if audio_norm_tensor is None:
        # Audio was cached by an earlier run; reload it for the spectrogram
        audio_norm_tensor = torch.load(audio_norm_path)

    audio_spec_tensor = spectrogram_torch(
        y=audio_norm_tensor,
        n_fft=filter_length,
        sampling_rate=sample_rate,
        hop_size=hop_length,
        win_size=window_length,
        center=False,
    ).squeeze(0)
    torch.save(audio_spec_tensor, audio_spec_path)

    return audio_norm_path, audio_spec_path
|
||||
Binary file not shown.
@@ -0,0 +1,54 @@
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .vad import SileroVoiceActivityDetector
|
||||
|
||||
|
||||
def trim_silence(
    audio_array: np.ndarray,
    detector: SileroVoiceActivityDetector,
    threshold: float = 0.2,
    samples_per_chunk=480,
    sample_rate=16000,
    keep_chunks_before: int = 2,
    keep_chunks_after: int = 2,
) -> Tuple[float, Optional[float]]:
    """Locate the main block of speech and return it as (offset, duration) in seconds.

    The audio is scored chunk by chunk with ``detector``. When at least two
    chunks reach ``threshold``, the span from the first to the last of them
    (padded by the ``keep_chunks_*`` arguments) is returned. Otherwise the
    result is ``(0.0, None)``.
    """
    seconds_per_chunk: float = samples_per_chunk / sample_rate
    total_samples = len(audio_array)

    first_chunk: Optional[int] = None
    last_chunk: Optional[int] = None
    chunk_idx: int = 0

    # A chunk is only scored while samples remain after it, so the final
    # chunk of the audio is never passed to the detector.
    while (chunk_idx + 1) * samples_per_chunk < total_samples:
        start = chunk_idx * samples_per_chunk
        prob = detector(
            audio_array[start : start + samples_per_chunk], sample_rate=sample_rate
        )

        if prob >= threshold:
            if first_chunk is None:
                # First speech
                first_chunk = chunk_idx
            else:
                # Last speech so far
                last_chunk = chunk_idx

        chunk_idx += 1

    if (first_chunk is None) or (last_chunk is None):
        # Fewer than two speech chunks: nothing is trimmed
        return 0.0, None

    # Pad the speech span and clamp it to the scanned range
    first_chunk = max(0, first_chunk - keep_chunks_before)
    last_chunk = min(chunk_idx, last_chunk + keep_chunks_after)

    offset_sec: float = first_chunk * seconds_per_chunk
    last_sec = (last_chunk + 1) * seconds_per_chunk
    return offset_sec, last_sec - offset_sec
|
||||
54
mlu_370-piper/piper/src/python/piper_train/norm_audio/vad.py
Normal file
54
mlu_370-piper/piper/src/python/piper_train/norm_audio/vad.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import typing
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
|
||||
|
||||
class SileroVoiceActivityDetector:
    """Detects speech/silence using Silero VAD.

    https://github.com/snakers4/silero-vad

    The recurrent state returned by the model is kept on the instance and fed
    back on the next call, so consecutive calls are not independent.
    """

    def __init__(self, onnx_path: typing.Union[str, Path]):
        """Load the ONNX model and initialise the recurrent state to zeros.

        Args:
            onnx_path: Path to the Silero VAD ONNX model file.
        """
        onnx_path = str(onnx_path)

        # Thread limits must be supplied through SessionOptions when the
        # session is built; assigning them on an existing InferenceSession
        # only creates unused Python attributes.
        session_options = onnxruntime.SessionOptions()
        session_options.intra_op_num_threads = 1
        session_options.inter_op_num_threads = 1
        self.session = onnxruntime.InferenceSession(
            onnx_path, sess_options=session_options
        )

        self._h = np.zeros((2, 1, 64)).astype("float32")
        self._c = np.zeros((2, 1, 64)).astype("float32")

    def __call__(self, audio_array: np.ndarray, sample_rate: int = 16000):
        """Return probability of speech in audio [0-1].

        Args:
            audio_array: One chunk of 16 kHz mono audio, either 1-D or with a
                leading batch dimension of size 1. It is cast to float32
                before being passed to the model.
            sample_rate: Must be 16000.

        Returns:
            Array holding the speech probability for the chunk.

        Raises:
            ValueError: If the input has more than two dimensions, a batch
                size above one, or a sample rate other than 16000.
        """
        if len(audio_array.shape) == 1:
            # Add batch dimension
            audio_array = np.expand_dims(audio_array, 0)

        if len(audio_array.shape) > 2:
            raise ValueError(
                f"Too many dimensions for input audio chunk {audio_array.shape}"
            )

        if audio_array.shape[0] > 1:
            raise ValueError("Onnx model does not support batching")

        if sample_rate != 16000:
            raise ValueError("Only 16Khz audio is supported")

        ort_inputs = {
            "input": audio_array.astype(np.float32),
            "h0": self._h,
            "c0": self._c,
        }
        ort_outs = self.session.run(None, ort_inputs)
        # Keep the updated recurrent state for the next chunk
        out, self._h, self._c = ort_outs

        out = out.squeeze(2)[:, 1]  # make output type match JIT analog

        return out
|
||||
502
mlu_370-piper/piper/src/python/piper_train/preprocess.py
Normal file
502
mlu_370-piper/piper/src/python/piper_train/preprocess.py
Normal file
@@ -0,0 +1,502 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import dataclasses
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from multiprocessing import JoinableQueue, Process, Queue
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
|
||||
from piper_phonemize import (
|
||||
phonemize_espeak,
|
||||
phonemize_codepoints,
|
||||
phoneme_ids_espeak,
|
||||
phoneme_ids_codepoints,
|
||||
get_codepoints_map,
|
||||
get_espeak_map,
|
||||
get_max_phonemes,
|
||||
tashkeel_run,
|
||||
)
|
||||
|
||||
from .norm_audio import cache_norm_audio, make_silence_detector
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
_VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
|
||||
_LOGGER = logging.getLogger("preprocess")
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
    """Source of the phonemes used for an utterance."""

    # Phonemes come from espeak-ng
    ESPEAK = "espeak"

    # Phonemes come from text itself
    TEXT = "text"
|
||||
|
||||
|
||||
def main() -> None:
    """Preprocess a dataset into ``config.json`` and ``dataset.jsonl``.

    The dataset is read twice: once to count speakers and utterances, and
    once to feed batches of utterances to worker processes that phonemize
    the text and (unless ``--skip-audio``) cache normalized audio. Each
    successfully processed utterance becomes one JSON line in
    ``dataset.jsonl`` inside the output directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-dir", required=True, help="Directory with audio dataset"
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write output files for training",
    )
    parser.add_argument("--language", required=True, help="eSpeak-ng voice")
    parser.add_argument(
        "--sample-rate",
        type=int,
        required=True,
        help="Target sample rate for voice (hertz)",
    )
    parser.add_argument(
        "--dataset-format", choices=("ljspeech", "mycroft"), required=True
    )
    parser.add_argument("--cache-dir", help="Directory to cache processed audio files")
    parser.add_argument("--max-workers", type=int)
    parser.add_argument(
        "--single-speaker", action="store_true", help="Force single speaker dataset"
    )
    parser.add_argument(
        "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
    )
    #
    parser.add_argument(
        "--phoneme-type",
        choices=list(PhonemeType),
        default=PhonemeType.ESPEAK,
        help="Type of phonemes to use (default: espeak)",
    )
    parser.add_argument(
        "--text-casing",
        choices=("ignore", "lower", "upper", "casefold"),
        default="ignore",
        help="Casing applied to utterance text",
    )
    #
    parser.add_argument(
        "--dataset-name",
        help="Name of dataset to put in config (default: name of <output_dir>/../)",
    )
    parser.add_argument(
        "--audio-quality",
        help="Audio quality to put in config (default: name of <output_dir>)",
    )
    #
    parser.add_argument(
        "--tashkeel",
        action="store_true",
        help="Diacritize Arabic text with libtashkeel",
    )
    #
    parser.add_argument(
        "--skip-audio", action="store_true", help="Don't preprocess audio"
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    if args.single_speaker and (args.speaker_id is not None):
        _LOGGER.fatal("--single-speaker and --speaker-id cannot both be provided")
        return

    level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=level)
    logging.getLogger().setLevel(level)

    # Prevent log spam
    logging.getLogger("numba").setLevel(logging.WARNING)

    # Ensure enum
    args.phoneme_type = PhonemeType(args.phoneme_type)

    # Convert to paths and create output directories
    args.input_dir = Path(args.input_dir)
    args.output_dir = Path(args.output_dir)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    args.cache_dir = (
        Path(args.cache_dir)
        if args.cache_dir
        else args.output_dir / "cache" / str(args.sample_rate)
    )
    args.cache_dir.mkdir(parents=True, exist_ok=True)

    if args.dataset_format == "mycroft":
        make_dataset = mycroft_dataset
    else:
        make_dataset = ljspeech_dataset

    # Count speakers
    _LOGGER.debug("Counting number of speakers/utterances in the dataset")
    speaker_counts: "Counter[str]" = Counter()
    num_utterances = 0
    for utt in make_dataset(args):
        speaker = utt.speaker or ""
        speaker_counts[speaker] += 1
        num_utterances += 1

    assert num_utterances > 0, "No utterances found"

    is_multispeaker = len(speaker_counts) > 1
    speaker_ids: Dict[str, int] = {}

    if is_multispeaker:
        _LOGGER.info("%s speakers detected", len(speaker_counts))

        # Assign speaker ids by most number of utterances first
        for speaker_id, (speaker, _speaker_count) in enumerate(
            speaker_counts.most_common()
        ):
            speaker_ids[speaker] = speaker_id
    else:
        _LOGGER.info("Single speaker dataset")

    # Write config
    audio_quality = args.audio_quality or args.output_dir.name
    dataset_name = args.dataset_name or args.output_dir.parent.name

    with open(args.output_dir / "config.json", "w", encoding="utf-8") as config_file:
        json.dump(
            {
                "dataset": dataset_name,
                "audio": {
                    "sample_rate": args.sample_rate,
                    "quality": audio_quality,
                },
                "espeak": {
                    "voice": args.language,
                },
                "language": {
                    "code": args.language,
                },
                "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
                "phoneme_type": args.phoneme_type.value,
                "phoneme_map": {},
                "phoneme_id_map": get_codepoints_map()[args.language]
                if args.phoneme_type == PhonemeType.TEXT
                else get_espeak_map(),
                "num_symbols": get_max_phonemes(),
                "num_speakers": len(speaker_counts),
                "speaker_id_map": speaker_ids,
                "piper_version": _VERSION,
            },
            config_file,
            ensure_ascii=False,
            indent=4,
        )
    _LOGGER.info("Wrote dataset config")

    if (args.max_workers is None) or (args.max_workers < 1):
        # os.cpu_count() may return None; fall back to a single worker
        args.max_workers = os.cpu_count() or 1

    # Aim for roughly two batches per worker, but never less than one
    # utterance per batch: batched() rejects a batch size of zero, which is
    # what small datasets would otherwise produce.
    batch_size = max(1, int(num_utterances / (args.max_workers * 2)))
    queue_in: "Queue[Iterable[Utterance]]" = JoinableQueue()
    queue_out: "Queue[Optional[Utterance]]" = Queue()

    # Start workers
    if args.phoneme_type == PhonemeType.TEXT:
        target = phonemize_batch_text
    else:
        target = phonemize_batch_espeak

    processes = [
        Process(target=target, args=(args, queue_in, queue_out))
        for _ in range(args.max_workers)
    ]
    for proc in processes:
        proc.start()

    _LOGGER.info(
        "Processing %s utterance(s) with %s worker(s)", num_utterances, args.max_workers
    )
    with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
        for utt_batch in batched(
            make_dataset(args),
            batch_size,
        ):
            queue_in.put(utt_batch)

        _LOGGER.debug("Waiting for jobs to finish")
        missing_phonemes: "Counter[str]" = Counter()
        # Exactly one result (an utterance, or None on failure) is expected
        # from the workers for every utterance counted above.
        for _ in range(num_utterances):
            utt = queue_out.get()
            if utt is not None:
                # speaker_ids is only populated for multi-speaker datasets;
                # a single named speaker keeps the id set by the dataset
                # reader instead of raising KeyError here.
                if (utt.speaker is not None) and (utt.speaker in speaker_ids):
                    utt.speaker_id = speaker_ids[utt.speaker]

                utt_dict = dataclasses.asdict(utt)
                utt_dict.pop("missing_phonemes")

                # JSONL
                json.dump(
                    utt_dict,
                    dataset_file,
                    ensure_ascii=False,
                    cls=PathEncoder,
                )
                print("", file=dataset_file)

                missing_phonemes.update(utt.missing_phonemes)

        if missing_phonemes:
            for phoneme, count in missing_phonemes.most_common():
                _LOGGER.warning("Missing %s (%s)", phoneme, count)

            _LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))

    # Signal workers to stop (one sentinel per worker)
    for proc in processes:
        queue_in.put(None)

    # Wait for workers to stop
    for proc in processes:
        proc.join(timeout=1)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_text_casing(casing: str):
    """Return the ``str`` transform selected by ``casing``.

    ``"lower"``, ``"upper"`` and ``"casefold"`` map to the matching ``str``
    method; any other value yields a function that returns its input as is.
    """
    transforms = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }
    if casing in transforms:
        return transforms[casing]

    return lambda s: s
|
||||
|
||||
|
||||
def phonemize_batch_espeak(
    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
    """Worker loop: phonemize utterance batches with espeak-ng.

    Batches are taken from ``queue_in`` until a ``None`` sentinel arrives.
    For every utterance in a batch exactly one item is put on ``queue_out``:
    the processed utterance, or ``None`` when it was skipped or failed. The
    consumer counts results per utterance, so a skipped utterance must still
    produce an item or the consumer would wait forever.

    Args:
        args: Parsed command-line arguments (reads ``text_casing``,
            ``tashkeel``, ``language``, ``skip_audio``, ``cache_dir`` and
            ``sample_rate``).
        queue_in: Queue of utterance batches, terminated by ``None``.
        queue_out: Queue receiving one result per utterance.
    """
    try:
        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()

        while True:
            utt_batch = queue_in.get()
            if utt_batch is None:
                break

            for utt in utt_batch:
                try:
                    if args.tashkeel:
                        utt.text = tashkeel_run(utt.text)

                    _LOGGER.debug(utt)
                    all_phonemes = phonemize_espeak(casing(utt.text), args.language)

                    # Flatten
                    utt.phonemes = [
                        phoneme
                        for sentence_phonemes in all_phonemes
                        for phoneme in sentence_phonemes
                    ]
                    utt.phoneme_ids = phoneme_ids_espeak(
                        utt.phonemes,
                        missing_phonemes=utt.missing_phonemes,
                    )
                    if not args.skip_audio:
                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
                            utt.audio_path,
                            args.cache_dir,
                            silence_detector,
                            args.sample_rate,
                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
                    # Still report a result so the consumer's per-utterance
                    # count stays in step.
                    queue_out.put(None)
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", utt)
                    queue_out.put(None)

            queue_in.task_done()
    except Exception:
        _LOGGER.exception("phonemize_batch_espeak")
|
||||
|
||||
|
||||
def phonemize_batch_text(
    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
):
    """Worker loop: phonemize utterance batches from the text's codepoints.

    Batches are taken from ``queue_in`` until a ``None`` sentinel arrives.
    For every utterance in a batch exactly one item is put on ``queue_out``:
    the processed utterance, or ``None`` when it was skipped or failed. The
    consumer counts results per utterance, so a skipped utterance must still
    produce an item or the consumer would wait forever.

    Args:
        args: Parsed command-line arguments (reads ``text_casing``,
            ``tashkeel``, ``language``, ``skip_audio``, ``cache_dir`` and
            ``sample_rate``).
        queue_in: Queue of utterance batches, terminated by ``None``.
        queue_out: Queue receiving one result per utterance.
    """
    try:
        casing = get_text_casing(args.text_casing)
        silence_detector = make_silence_detector()

        while True:
            utt_batch = queue_in.get()
            if utt_batch is None:
                break

            for utt in utt_batch:
                try:
                    if args.tashkeel:
                        utt.text = tashkeel_run(utt.text)

                    _LOGGER.debug(utt)
                    all_phonemes = phonemize_codepoints(casing(utt.text))
                    # Flatten
                    utt.phonemes = [
                        phoneme
                        for sentence_phonemes in all_phonemes
                        for phoneme in sentence_phonemes
                    ]
                    utt.phoneme_ids = phoneme_ids_codepoints(
                        args.language,
                        utt.phonemes,
                        missing_phonemes=utt.missing_phonemes,
                    )
                    if not args.skip_audio:
                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
                            utt.audio_path,
                            args.cache_dir,
                            silence_detector,
                            args.sample_rate,
                        )
                    queue_out.put(utt)
                except TimeoutError:
                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
                    # Still report a result so the consumer's per-utterance
                    # count stays in step.
                    queue_out.put(None)
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", utt)
                    queue_out.put(None)

            queue_in.task_done()
    except Exception:
        _LOGGER.exception("phonemize_batch_text")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class Utterance:
    """One dataset entry as it moves through preprocessing.

    The dataset readers fill in ``text``, ``audio_path``, ``speaker`` and
    ``speaker_id``; the phonemize workers fill in the remaining fields.
    """

    # Transcript text taken from the dataset metadata
    text: str
    # Path of the source audio file
    audio_path: Path
    # Speaker name from the metadata, if the dataset provides one
    speaker: Optional[str] = None
    # Numeric speaker id
    speaker_id: Optional[int] = None
    # Flattened phonemes for the text, set by the phonemize workers
    phonemes: Optional[List[str]] = None
    # Ids corresponding to ``phonemes``, set by the phonemize workers
    phoneme_ids: Optional[List[int]] = None
    # Cached normalized audio file (left as None with --skip-audio)
    audio_norm_path: Optional[Path] = None
    # Cached spectrogram file (left as None with --skip-audio)
    audio_spec_path: Optional[Path] = None
    # Counter handed to the phoneme-id lookup to collect missing phonemes;
    # removed from the record before it is written to the dataset file
    missing_phonemes: "Counter[str]" = field(default_factory=Counter)
|
||||
|
||||
|
||||
class PathEncoder(json.JSONEncoder):
    """JSON encoder that writes ``pathlib.Path`` objects as strings."""

    def default(self, o):
        """Encode ``Path`` as ``str``; defer everything else to the base class."""
        return str(o) if isinstance(o, Path) else super().default(o)
|
||||
|
||||
|
||||
def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
    """Yield utterances from an LJSpeech-style ``metadata.csv``.

    Rows are ``filename|speaker|text`` with the speaker column optional.
    The audio file is looked up next to the metadata file and then in a
    ``wav``/``wavs`` sub-directory, each time with and without a ``.wav``
    suffix. Unless ``args.skip_audio`` is set, rows whose audio file is
    missing or empty are skipped with a warning.
    """
    dataset_dir = args.input_dir

    # filename|speaker|text
    # speaker is optional
    metadata_path = dataset_dir / "metadata.csv"
    assert metadata_path.exists(), f"Missing {metadata_path}"

    wav_dir = dataset_dir / "wav"
    if not wav_dir.is_dir():
        wav_dir = dataset_dir / "wavs"

    with open(metadata_path, "r", encoding="utf-8") as csv_file:
        for row in csv.reader(csv_file, delimiter="|"):
            assert len(row) >= 2, "Not enough columns"

            filename, text = row[0], row[-1]
            speaker: Optional[str] = None
            if (not args.single_speaker) and (len(row) != 2):
                speaker = row[1]

            # Candidate locations in priority order; the last one is kept
            # when none of them exists.
            candidates = (
                metadata_path.parent / filename,
                metadata_path.parent / f"{filename}.wav",
                wav_dir / filename,
                wav_dir / f"{filename}.wav",
            )
            wav_path = next(
                (path for path in candidates if path.exists()), candidates[-1]
            )

            if not args.skip_audio:
                if not wav_path.exists():
                    _LOGGER.warning("Missing %s", filename)
                    continue

                if wav_path.stat().st_size == 0:
                    _LOGGER.warning("Empty file: %s", wav_path)
                    continue

            yield Utterance(
                text=text,
                audio_path=wav_path,
                speaker=speaker,
                speaker_id=args.speaker_id,
            )
|
||||
|
||||
|
||||
def mycroft_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
    """Yield utterances from Mycroft-style ``*-metadata.txt`` files.

    Every metadata file found under the input directory is treated as one
    speaker, named after the file's parent directory and numbered in the
    order the files are found. Rows are ``filename|text|length``. Unless
    ``args.skip_audio`` is set, rows whose audio file is missing or empty
    are silently dropped.
    """
    single_speaker = args.single_speaker

    for speaker_index, metadata_path in enumerate(
        args.input_dir.glob("**/*-metadata.txt")
    ):
        speaker_dir = metadata_path.parent
        speaker = None if single_speaker else speaker_dir.name
        speaker_id = None if single_speaker else speaker_index

        with open(metadata_path, "r", encoding="utf-8") as csv_file:
            # filename|text|length
            for row in csv.reader(csv_file, delimiter="|"):
                filename, text = row[0], row[1]
                wav_path = speaker_dir / filename

                if not args.skip_audio:
                    if (not wav_path.exists()) or (wav_path.stat().st_size <= 0):
                        continue

                yield Utterance(
                    text=text,
                    audio_path=wav_path,
                    speaker=speaker,
                    speaker_id=speaker_id,
                )
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def batched(iterable, n):
    """Yield successive lists of up to ``n`` items taken from ``iterable``.

    The final list may be shorter; ``ValueError`` is raised if ``n < 1``.
    """
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        raise ValueError("n must be at least one")

    iterator = iter(iterable)
    while True:
        batch = list(itertools.islice(iterator, n))
        if not batch:
            return
        yield batch
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
0
mlu_370-piper/piper/src/python/piper_train/py.typed
Normal file
0
mlu_370-piper/piper/src/python/piper_train/py.typed
Normal file
40
mlu_370-piper/piper/src/python/piper_train/pylintrc
Normal file
40
mlu_370-piper/piper/src/python/piper_train/pylintrc
Normal file
@@ -0,0 +1,40 @@
|
||||
[MESSAGES CONTROL]
|
||||
disable=
|
||||
format,
|
||||
abstract-class-little-used,
|
||||
abstract-method,
|
||||
cyclic-import,
|
||||
duplicate-code,
|
||||
global-statement,
|
||||
import-outside-toplevel,
|
||||
inconsistent-return-statements,
|
||||
locally-disabled,
|
||||
not-context-manager,
|
||||
redefined-variable-type,
|
||||
too-few-public-methods,
|
||||
too-many-arguments,
|
||||
too-many-branches,
|
||||
too-many-instance-attributes,
|
||||
too-many-lines,
|
||||
too-many-locals,
|
||||
too-many-public-methods,
|
||||
too-many-return-statements,
|
||||
too-many-statements,
|
||||
too-many-boolean-expressions,
|
||||
unnecessary-pass,
|
||||
unused-argument,
|
||||
broad-except,
|
||||
too-many-nested-blocks,
|
||||
invalid-name,
|
||||
unused-import,
|
||||
no-self-use,
|
||||
fixme,
|
||||
useless-super-delegation,
|
||||
missing-module-docstring,
|
||||
missing-class-docstring,
|
||||
missing-function-docstring,
|
||||
import-error,
|
||||
relative-beyond-top-level
|
||||
|
||||
[FORMAT]
|
||||
expected-line-ending-format=LF
|
||||
43
mlu_370-piper/piper/src/python/piper_train/select_speaker.py
Normal file
43
mlu_370-piper/piper/src/python/piper_train/select_speaker.py
Normal file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
|
||||
def main():
    """Filter ``audio|speaker|text`` rows on stdin down to one speaker.

    With ``--speaker-name`` the matching rows are written as ``audio|text``.
    With ``--speaker-number`` the speakers are ranked by utterance count
    (most first, starting at 0); the rows of the speaker at that rank are
    written and the speaker's name is printed to stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--speaker-number", type=int)
    parser.add_argument("--speaker-name")
    args = parser.parse_args()

    assert (args.speaker_number is not None) or (args.speaker_name is not None)

    reader = csv.reader(sys.stdin, delimiter="|")
    writer = csv.writer(sys.stdout, delimiter="|")

    if args.speaker_name is not None:
        # Stream the rows of the named speaker straight through
        for row in reader:
            if row[1] == args.speaker_name:
                writer.writerow((row[0], row[-1]))
        return

    # Group rows by speaker so they can be ranked by utterance count
    utterances = defaultdict(list)
    counts = Counter()
    for row in reader:
        speaker = row[1]
        utterances[speaker].append((row[0], row[-1]))
        counts[speaker] += 1

    for rank, (speaker, _count) in enumerate(counts.most_common()):
        if rank != args.speaker_number:
            continue

        for selected in utterances[speaker]:
            writer.writerow(selected)

        print(speaker, file=sys.stderr)
        break
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
22
mlu_370-piper/piper/src/python/piper_train/setup.cfg
Normal file
22
mlu_370-piper/piper/src/python/piper_train/setup.cfg
Normal file
@@ -0,0 +1,22 @@
|
||||
[flake8]
|
||||
# To work with Black
|
||||
max-line-length = 88
|
||||
# E501: line too long
|
||||
# W503: Line break occurred before a binary operator
|
||||
# E203: Whitespace before ':'
|
||||
# D202 No blank lines allowed after function docstring
|
||||
# W504 line break after binary operator
|
||||
ignore =
|
||||
E501,
|
||||
W503,
|
||||
E203,
|
||||
D202,
|
||||
W504
|
||||
|
||||
[isort]
|
||||
multi_line_output = 3
|
||||
include_trailing_comma=True
|
||||
force_grid_wrap=0
|
||||
use_parentheses=True
|
||||
line_length=88
|
||||
indent = " "
|
||||
427
mlu_370-piper/piper/src/python/piper_train/vits/attentions.py
Normal file
427
mlu_370-piper/piper/src/python/piper_train/vits/attentions.py
Normal file
@@ -0,0 +1,427 @@
|
||||
import math
|
||||
import typing
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from .commons import subsequent_mask
|
||||
from .modules import LayerNorm
|
||||
|
||||
|
||||
class Encoder(nn.Module):
    """Stack of self-attention + feed-forward layers with residual connections.

    Each of the ``n_layers`` layers consists of a ``MultiHeadAttention``
    module, a ``LayerNorm``, an ``FFN`` module and a second ``LayerNorm``.
    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int = 1,
        p_dropout: float = 0.0,
        window_size: int = 4,
        **kwargs
    ):
        """Create the layer stack.

        Args:
            hidden_channels: Passed twice to each attention and FFN module
                and used as the size of each LayerNorm.
            filter_channels: Third positional argument of each FFN module.
            n_heads: Number of heads given to each attention module.
            n_layers: Number of attention/FFN layer pairs to build.
            kernel_size: Kernel size given to each FFN module.
            p_dropout: Dropout probability for this module's dropout and for
                the attention and FFN modules.
            window_size: Forwarded to each attention module.
            **kwargs: Accepted but not used in this constructor.
        """
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Sub-modules of one layer are created together, layer by layer
        for i in range(self.n_layers):
            self.attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    window_size=window_size,
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run ``x`` through every layer, masking with ``x_mask``.

        Args:
            x: Input tensor.
            x_mask: Mask multiplied into ``x`` before and after the stack.
                Presumably shaped (batch, 1, time) to match a
                (batch, channels, time) input; TODO confirm against callers.

        Returns:
            The masked output of the last layer.
        """
        # Outer product of the mask with itself gives a pairwise mask for
        # the attention layers
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for attn_layer, norm_layer_1, ffn_layer, norm_layer_2 in zip(
            self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
        ):
            # Self-attention (x is passed as both inputs), dropout, then
            # residual connection followed by normalization
            y = attn_layer(x, x, attn_mask)
            y = self.drop(y)
            x = norm_layer_1(x + y)

            # Feed-forward sub-layer with the same residual pattern
            y = ffn_layer(x, x_mask)
            y = self.drop(y)
            x = norm_layer_2(x + y)
        x = x * x_mask
        return x
|
||||
|
||||
|
||||
class Decoder(nn.Module):
    """Stack of masked self-attention, encoder-decoder attention and FFN layers.

    Each sub-layer is wrapped as ``norm(x + dropout(sublayer(x)))``.  The
    feed-forward sub-layer is built with ``causal=True``.
    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int = 1,
        p_dropout: float = 0.0,
        proximal_bias: bool = False,
        proximal_init: bool = True,
        **kwargs
    ):
        """Build ``n_layers`` decoder layers.

        Any extra keyword arguments are accepted and ignored.
        """
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        # Creation order within a layer is kept fixed so that seeded parameter
        # initialisation stays the same.
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            cross_attention = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        # Lower-triangular mask sized by dimension 2 of x_mask.
        self_attn_mask = subsequent_mask(x_mask.size(2)).type_as(x)
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask

        layers = zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for self_attend, norm_0, cross_attend, norm_1, feed_forward, norm_2 in layers:
            x = norm_0(x + self.drop(self_attend(x, x, self_attn_mask)))
            x = norm_1(x + self.drop(cross_attend(x, h, encdec_attn_mask)))
            x = norm_2(x + self.drop(feed_forward(x, x_mask)))

        return x * x_mask
|
||||
|
||||
|
||||
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention built from 1x1 ``Conv1d`` projections.

    Optional extras, each enabled by a constructor argument:

    * ``window_size``: learned relative-position embeddings for keys and
      values (self-attention only).
    * ``proximal_bias``: adds ``-log1p(|i - j|)`` to the scores
      (self-attention only).
    * ``block_length``: restricts attention to a band around the diagonal
      (self-attention only, and only applied when a mask is given).
    * ``proximal_init``: starts the key projection as a copy of the query
      projection.
    """

    def __init__(
        self,
        channels: int,
        out_channels: int,
        n_heads: int,
        p_dropout: float = 0.0,
        window_size: typing.Optional[int] = None,
        heads_share: bool = True,
        block_length: typing.Optional[int] = None,
        proximal_bias: bool = False,
        proximal_init: bool = False,
    ):
        """Create the projections and, if requested, the relative embeddings.

        ``channels`` must be divisible by ``n_heads``.  With ``heads_share``
        the relative embeddings have a single leading entry that is shared
        by all heads; otherwise there is one entry per head.
        """
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Placeholder; forward() overwrites this with the latest attention weights.
        self.attn = torch.zeros(1)

        # Per-head channel count.
        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            # One embedding per relative offset in [-window_size, window_size].
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start with identical query and key projections.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys and values).

        The attention weights of this call are kept in ``self.attn``.
        Returns the attended values after the output projection.
        """
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Compute attention and return ``(output, attention_weights)``.

        ``output`` is ``[b, d, t_t]`` and the weights are
        ``[b, n_h, t_t, t_s]``.  Positions where ``mask == 0`` are filled
        with ``-1e4`` before the softmax.
        """
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (key.size(0), key.size(1), key.size(2), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # Scaled dot-product scores: the query is divided by sqrt(d_k).
        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                query / math.sqrt(self.k_channels), key_relative_embeddings
            )
            # Re-index the logits from relative offsets to absolute positions.
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).type_as(scores)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                # Band mask: ones within block_length of the diagonal.
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            # Add the relative-position value term, weighted by the same
            # attention probabilities re-indexed by relative offset.
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length: int):
        """Return the ``2 * length - 1`` embeddings needed for ``length`` positions.

        The table is zero-padded along dimension 1 when ``length`` exceeds
        ``window_size + 1`` and sliced when it is shorter.
        """
        # max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                # convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
                (0, 0, pad_length, pad_length, 0, 0),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()

        # Concat columns of pad to shift from relative to absolute indexing.
        # x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
        x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        # x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
        x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0))

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, (2 * length) - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()

        # Pad along the column dimension.
        # x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0))
        x_flat = x.view([batch, heads, (length * length) + (length * (length - 1))])
        # Add zeros at the beginning; they skew the elements after the reshape.
        # x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length: int):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        # diff[i, j] = j - i; the bias is -log(1 + |i - j|).
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
|
||||
|
||||
|
||||
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward block.

    The input is masked and padded before each convolution.  With
    ``causal=True`` all padding goes on the left; otherwise it is split
    between both sides.  The output is multiplied by the mask.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float = 0.0,
        activation: str = "",
        causal: bool = False,
    ):
        """Create the two convolutions and the dropout layer.

        ``activation == "gelu"`` selects ``x * sigmoid(1.702 * x)``; any
        other value selects ReLU.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv and mask the result."""
        # The padding strategy is the same for both convolutions.
        pad = self._causal_padding if self.causal else self._same_padding

        hidden = self.conv_1(pad(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)

        out = self.conv_2(pad(hidden * x_mask))
        return out * x_mask

    def _causal_padding(self, x):
        """Left-pad the last dimension by ``kernel_size - 1``."""
        if self.kernel_size == 1:
            return x
        left = self.kernel_size - 1
        # padding = [[0, 0], [0, 0], [left, 0]]
        return F.pad(x, (left, 0, 0, 0, 0, 0))

    def _same_padding(self, x):
        """Pad both sides of the last dimension by ``kernel_size - 1`` in total."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        # padding = [[0, 0], [0, 0], [left, right]]
        return F.pad(x, (left, right, 0, 0, 0, 0))
|
||||
147
mlu_370-piper/piper/src/python/piper_train/vits/commons.py
Normal file
147
mlu_370-piper/piper/src/python/piper_train/vits/commons.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import logging
|
||||
import math
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
_LOGGER = logging.getLogger("vits.commons")
|
||||
|
||||
|
||||
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise the weight of ``m`` from N(mean, std) if it is a conv module.

    A module counts as a conv module when its class name contains "Conv";
    every other module is left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
    """Return ``dilation * (kernel_size - 1) / 2`` truncated to an int."""
    span = dilation * (kernel_size - 1)
    return int(span / 2)
|
||||
|
||||
|
||||
def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements of ``lst``."""
    size = len(lst)
    spaced = [item for _ in range(2 * size + 1)]
    # Odd positions receive the original elements.
    spaced[1::2] = lst
    return spaced
|
||||
|
||||
|
||||
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)

    Element-wise KL divergence between two Gaussians P and Q, given their
    means (``m_p``, ``m_q``) and log standard deviations (``logs_p``,
    ``logs_q``).

    The terms are combined out of place so the arguments may broadcast
    against each other.  An in-place ``+=`` on the log-std term would raise
    whenever the means broadcast to a larger shape than the log-stds.
    """
    log_ratio = (logs_q - logs_p) - 0.5
    scaled_moment = (
        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
    )
    return log_ratio + scaled_moment
|
||||
|
||||
|
||||
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Squeeze the uniform samples into [0.00001, 0.99999) so neither log hits 0.
    u = torch.rand(shape) * 0.99998 + 0.00001
    inner = -torch.log(u)
    return -torch.log(inner)
|
||||
|
||||
|
||||
def rand_gumbel_like(x):
    """Return Gumbel noise with the size, dtype and device of ``x``."""
    noise = rand_gumbel(x.size())
    return noise.to(dtype=x.dtype, device=x.device)
|
||||
|
||||
|
||||
def slice_segments(x, ids_str, segment_size=4):
    """Cut one window of ``segment_size`` along the last dimension per batch item.

    ``ids_str[i]`` is the start index for item ``i``; negative starts are
    clamped to 0.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for batch_idx in range(x.size(0)):
        start = max(0, ids_str[batch_idx])
        stop = start + segment_size
        segments[batch_idx] = x[batch_idx, :, start:stop]
    return segments
|
||||
|
||||
|
||||
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice a random window of ``segment_size`` from each batch item.

    Start indices are drawn uniformly below ``x_lengths - segment_size + 1``.
    When ``x_lengths`` is None, the size of the last dimension of ``x`` is
    used instead.  Returns ``(segments, start_indices)``.
    """
    batch_size, _, num_frames = x.size()
    lengths = num_frames if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    uniform = torch.rand([batch_size]).to(device=x.device)
    starts = (uniform * max_start).to(dtype=torch.long)
    return slice_segments(x, starts, segment_size), starts
|
||||
|
||||
|
||||
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines of the position, scaled by geometrically spaced
    timescales from ``min_timescale`` to ``max_timescale``.  An odd
    ``channels`` gets one trailing row of zeros.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # With a single timescale (channels of 2 or 3) ``num_timescales - 1`` is 0
    # and the division used to raise ZeroDivisionError.  The increment is
    # unused in that case, so the denominator is clamped to at least 1.
    log_timescale_increment = math.log(
        float(max_timescale) / float(min_timescale)
    ) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Add one row of zeros when ``channels`` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
|
||||
|
||||
|
||||
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Return ``x`` plus the timing signal for its channel count and length."""
    _, num_channels, num_steps = x.size()
    timing = get_timing_signal_1d(num_steps, num_channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
|
||||
|
||||
|
||||
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate ``x`` with its timing signal along ``axis``."""
    _, num_channels, num_steps = x.size()
    timing = get_timing_signal_1d(num_steps, num_channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
|
||||
|
||||
|
||||
def subsequent_mask(length: int):
    """Return a ``[1, 1, length, length]`` lower-triangular mask of ones."""
    lower = torch.ones(length, length).tril()
    return lower[None, None]
|
||||
|
||||
|
||||
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Gated activation: adds the two inputs, then multiplies tanh of the first
    # n_channels[0] channels (dimension 1) by sigmoid of the remaining ones.
    # NOTE(review): n_channels is indexed with [0], so it is presumably passed
    # as a one-element tensor -- confirm against callers.
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts
|
||||
|
||||
|
||||
def sequence_mask(length, max_length: Optional[int] = None):
    """Return a boolean mask that is True where ``position < length``.

    ``length`` holds one length per row.  The mask has ``max_length`` columns,
    defaulting to the largest value in ``length``.
    """
    num_positions = length.max() if max_length is None else max_length
    positions = torch.arange(num_positions, dtype=length.dtype, device=length.device)
    row_positions = positions.unsqueeze(0)
    column_lengths = length.unsqueeze(1)
    return row_positions < column_lengths
|
||||
|
||||
|
||||
def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Builds a path from the cumulative durations: for each of the t_x input
    steps, a sequence mask up to its cumulative end minus the mask of the
    previous step, then multiplied by ``mask``.
    """
    batch, _, out_len, in_len = mask.shape
    ends = torch.cumsum(duration, -1).view(batch * in_len)

    # filled[b, x, y] is 1 for every y before the cumulative end of step x.
    filled = sequence_mask(ends, out_len).type_as(mask).view(batch, in_len, out_len)
    # Shift by one step along t_x so the subtraction leaves each step's own span.
    shifted = F.pad(filled, (0, 0, 1, 0, 0, 0))[:, :-1]
    spans = filled - shifted
    return spans.unsqueeze(1).transpose(2, 3) * mask
|
||||
|
||||
|
||||
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients to ``[-clip_value, clip_value]`` and return their total norm.

    Each parameter's norm is measured before that parameter is clamped.
    Parameters without a gradient are skipped.  With ``clip_value`` set to
    None nothing is clamped and only the norm is computed.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    limit = None if clip_value is None else float(clip_value)

    accumulated = 0
    for param in with_grad:
        accumulated += param.grad.data.norm(norm_type).item() ** norm_type
        if limit is not None:
            param.grad.data.clamp_(min=-limit, max=limit)
    return accumulated ** (1.0 / norm_type)
|
||||
330
mlu_370-piper/piper/src/python/piper_train/vits/config.py
Normal file
330
mlu_370-piper/piper/src/python/piper_train/vits/config.py
Normal file
@@ -0,0 +1,330 @@
|
||||
"""Configuration classes"""
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Tuple
|
||||
|
||||
|
||||
@dataclass
class MelAudioConfig:
    """Audio and mel-spectrogram settings; every field has a default.

    NOTE(review): the code that consumes these values is not visible here.
    The per-field notes are inferred from the field names -- confirm.
    """

    filter_length: int = 1024  # presumably the FFT size -- confirm
    hop_length: int = 256  # presumably samples between frames -- confirm
    win_length: int = 1024  # presumably the analysis window size -- confirm
    mel_channels: int = 80
    sample_rate: int = 22050
    sample_bytes: int = 2
    channels: int = 1
    mel_fmin: float = 0.0
    mel_fmax: Optional[float] = None
|
||||
|
||||
|
||||
@dataclass
class ModelAudioConfig:
    """Resblock and upsampling settings; no field has a default.

    Two ready-made presets are available through ``low_quality()`` and
    ``high_quality()``.
    """

    resblock: str
    resblock_kernel_sizes: Tuple[int, ...]
    resblock_dilation_sizes: Tuple[Tuple[int, ...], ...]
    upsample_rates: Tuple[int, ...]
    upsample_initial_channel: int
    upsample_kernel_sizes: Tuple[int, ...]

    @staticmethod
    def low_quality() -> "ModelAudioConfig":
        """Return the preset with resblock "2" and three upsampling stages (8, 8, 4)."""
        return ModelAudioConfig(
            resblock="2",
            resblock_kernel_sizes=(3, 5, 7),
            resblock_dilation_sizes=(
                (1, 2),
                (2, 6),
                (3, 12),
            ),
            upsample_rates=(8, 8, 4),
            upsample_initial_channel=256,
            upsample_kernel_sizes=(16, 16, 8),
        )

    @staticmethod
    def high_quality() -> "ModelAudioConfig":
        """Return the preset with resblock "1" and four upsampling stages (8, 8, 2, 2)."""
        return ModelAudioConfig(
            resblock="1",
            resblock_kernel_sizes=(3, 7, 11),
            resblock_dilation_sizes=(
                (1, 3, 5),
                (1, 3, 5),
                (1, 3, 5),
            ),
            upsample_rates=(8, 8, 2, 2),
            upsample_initial_channel=512,
            upsample_kernel_sizes=(16, 16, 4, 4),
        )
|
||||
|
||||
|
||||
@dataclass
class ModelConfig:
    """Model hyper-parameters plus the nested audio and mel configurations.

    ``num_symbols``, ``n_speakers`` and ``audio`` are required; every other
    field has a default.  The resblock and upsample properties forward to
    ``self.audio``.
    """

    num_symbols: int
    n_speakers: int
    audio: ModelAudioConfig
    mel: MelAudioConfig = field(default_factory=MelAudioConfig)

    inter_channels: int = 192
    hidden_channels: int = 192
    filter_channels: int = 768
    n_heads: int = 2
    n_layers: int = 6
    kernel_size: int = 3
    p_dropout: float = 0.1
    n_layers_q: int = 3
    use_spectral_norm: bool = False
    gin_channels: int = 0  # single speaker
    use_sdp: bool = True  # StochasticDurationPredictor
    segment_size: int = 8192

    @property
    def is_multispeaker(self) -> bool:
        """True when ``n_speakers`` is greater than 1."""
        return self.n_speakers > 1

    @property
    def resblock(self) -> str:
        """Forward to ``self.audio.resblock``."""
        return self.audio.resblock

    @property
    def resblock_kernel_sizes(self) -> Tuple[int, ...]:
        """Forward to ``self.audio.resblock_kernel_sizes``."""
        return self.audio.resblock_kernel_sizes

    @property
    def resblock_dilation_sizes(self) -> Tuple[Tuple[int, ...], ...]:
        """Forward to ``self.audio.resblock_dilation_sizes``."""
        return self.audio.resblock_dilation_sizes

    @property
    def upsample_rates(self) -> Tuple[int, ...]:
        """Forward to ``self.audio.upsample_rates``."""
        return self.audio.upsample_rates

    @property
    def upsample_initial_channel(self) -> int:
        """Forward to ``self.audio.upsample_initial_channel``."""
        return self.audio.upsample_initial_channel

    @property
    def upsample_kernel_sizes(self) -> Tuple[int, ...]:
        """Forward to ``self.audio.upsample_kernel_sizes``."""
        return self.audio.upsample_kernel_sizes

    def __post_init__(self):
        # A multi-speaker model left at gin_channels == 0 gets 512 instead.
        if self.is_multispeaker and (self.gin_channels == 0):
            self.gin_channels = 512
|
||||
|
||||
|
||||
@dataclass
class TrainingConfig:
    """Training hyper-parameters; every field has a default.

    NOTE(review): how these values are consumed is not visible in this
    file.  The names suggest optimiser and loss-weight settings -- confirm.
    """

    learning_rate: float = 2e-4
    betas: Tuple[float, float] = field(default=(0.8, 0.99))
    eps: float = 1e-9
    # batch_size: int = 32
    fp16_run: bool = False
    lr_decay: float = 0.999875
    init_lr_ratio: float = 1.0
    warmup_epochs: int = 0
    c_mel: int = 45
    c_kl: float = 1.0
    grad_clip: Optional[float] = None  # None presumably disables clipping -- confirm
|
||||
|
||||
|
||||
# @dataclass
|
||||
# class PhonemesConfig(DataClassJsonMixin):
|
||||
# phoneme_separator: str = " "
|
||||
# """Separator between individual phonemes in CSV input"""
|
||||
|
||||
# word_separator: str = "#"
|
||||
# """Separator between word phonemes in CSV input (must not match phoneme_separator)"""
|
||||
|
||||
# phoneme_to_id: typing.Optional[typing.Dict[str, int]] = None
|
||||
# pad: typing.Optional[str] = "_"
|
||||
# bos: typing.Optional[str] = None
|
||||
# eos: typing.Optional[str] = None
|
||||
# blank: typing.Optional[str] = "#"
|
||||
# blank_word: typing.Optional[str] = None
|
||||
# blank_between: typing.Union[str, BlankBetween] = BlankBetween.WORDS
|
||||
# blank_at_start: bool = True
|
||||
# blank_at_end: bool = True
|
||||
# simple_punctuation: bool = True
|
||||
# punctuation_map: typing.Optional[typing.Dict[str, str]] = None
|
||||
# separate: typing.Optional[typing.List[str]] = None
|
||||
# separate_graphemes: bool = False
|
||||
# separate_tones: bool = False
|
||||
# tone_before: bool = False
|
||||
# phoneme_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None
|
||||
# auto_bos_eos: bool = False
|
||||
# minor_break: typing.Optional[str] = IPA.BREAK_MINOR.value
|
||||
# major_break: typing.Optional[str] = IPA.BREAK_MAJOR.value
|
||||
# break_phonemes_into_graphemes: bool = False
|
||||
# break_phonemes_into_codepoints: bool = False
|
||||
# drop_stress: bool = False
|
||||
# symbols: typing.Optional[typing.List[str]] = None
|
||||
|
||||
# def split_word_phonemes(self, phonemes_str: str) -> typing.List[typing.List[str]]:
|
||||
# """Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)"""
|
||||
# return [
|
||||
# word_phonemes_str.split(self.phoneme_separator)
|
||||
# if self.phoneme_separator
|
||||
# else list(word_phonemes_str)
|
||||
# for word_phonemes_str in phonemes_str.split(self.word_separator)
|
||||
# ]
|
||||
|
||||
# def join_word_phonemes(self, word_phonemes: typing.List[typing.List[str]]) -> str:
|
||||
# """Join a list of lists of phonemes (outer is words, inner is individual phonemes in each word) into a single phonemes string"""
|
||||
# return self.word_separator.join(
|
||||
# self.phoneme_separator.join(wp) for wp in word_phonemes
|
||||
# )
|
||||
|
||||
|
||||
# class Phonemizer(str, Enum):
|
||||
# SYMBOLS = "symbols"
|
||||
# GRUUT = "gruut"
|
||||
# ESPEAK = "espeak"
|
||||
# EPITRAN = "epitran"
|
||||
|
||||
|
||||
# class Aligner(str, Enum):
|
||||
# KALDI_ALIGN = "kaldi_align"
|
||||
|
||||
|
||||
# class TextCasing(str, Enum):
|
||||
# LOWER = "lower"
|
||||
# UPPER = "upper"
|
||||
|
||||
|
||||
# class MetadataFormat(str, Enum):
|
||||
# TEXT = "text"
|
||||
# PHONEMES = "phonemes"
|
||||
# PHONEME_IDS = "ids"
|
||||
|
||||
|
||||
# @dataclass
|
||||
# class DatasetConfig:
|
||||
# name: str
|
||||
# metadata_format: MetadataFormat = MetadataFormat.TEXT
|
||||
# multispeaker: bool = False
|
||||
# text_language: typing.Optional[str] = None
|
||||
# audio_dir: typing.Optional[typing.Union[str, Path]] = None
|
||||
# cache_dir: typing.Optional[typing.Union[str, Path]] = None
|
||||
|
||||
# def get_cache_dir(self, output_dir: typing.Union[str, Path]) -> Path:
|
||||
# if self.cache_dir is not None:
|
||||
# cache_dir = Path(self.cache_dir)
|
||||
# else:
|
||||
# cache_dir = Path("cache") / self.name
|
||||
|
||||
# if not cache_dir.is_absolute():
|
||||
# cache_dir = Path(output_dir) / str(cache_dir)
|
||||
|
||||
# return cache_dir
|
||||
|
||||
|
||||
# @dataclass
|
||||
# class AlignerConfig:
|
||||
# aligner: typing.Optional[Aligner] = None
|
||||
# casing: typing.Optional[TextCasing] = None
|
||||
|
||||
|
||||
# @dataclass
|
||||
# class InferenceConfig:
|
||||
# length_scale: float = 1.0
|
||||
# noise_scale: float = 0.667
|
||||
# noise_w: float = 0.8
|
||||
|
||||
|
||||
# @dataclass
|
||||
# class TrainingConfig(DataClassJsonMixin):
|
||||
# seed: int = 1234
|
||||
# epochs: int = 10000
|
||||
# learning_rate: float = 2e-4
|
||||
# betas: typing.Tuple[float, float] = field(default=(0.8, 0.99))
|
||||
# eps: float = 1e-9
|
||||
# batch_size: int = 32
|
||||
# fp16_run: bool = False
|
||||
# lr_decay: float = 0.999875
|
||||
# segment_size: int = 8192
|
||||
# init_lr_ratio: float = 1.0
|
||||
# warmup_epochs: int = 0
|
||||
# c_mel: int = 45
|
||||
# c_kl: float = 1.0
|
||||
# grad_clip: typing.Optional[float] = None
|
||||
|
||||
# min_seq_length: typing.Optional[int] = None
|
||||
# max_seq_length: typing.Optional[int] = None
|
||||
|
||||
# min_spec_length: typing.Optional[int] = None
|
||||
# max_spec_length: typing.Optional[int] = None
|
||||
|
||||
# min_speaker_utterances: typing.Optional[int] = None
|
||||
|
||||
# last_epoch: int = 1
|
||||
# global_step: int = 1
|
||||
# best_loss: typing.Optional[float] = None
|
||||
# audio: AudioConfig = field(default_factory=AudioConfig)
|
||||
# model: ModelConfig = field(default_factory=ModelConfig)
|
||||
# phonemes: PhonemesConfig = field(default_factory=PhonemesConfig)
|
||||
# text_aligner: AlignerConfig = field(default_factory=AlignerConfig)
|
||||
# text_language: typing.Optional[str] = None
|
||||
# phonemizer: typing.Optional[Phonemizer] = None
|
||||
# datasets: typing.List[DatasetConfig] = field(default_factory=list)
|
||||
# inference: InferenceConfig = field(default_factory=InferenceConfig)
|
||||
|
||||
# version: int = 1
|
||||
# git_commit: str = ""
|
||||
|
||||
# @property
|
||||
# def is_multispeaker(self):
|
||||
# return self.model.is_multispeaker or any(d.multispeaker for d in self.datasets)
|
||||
|
||||
# def save(self, config_file: typing.TextIO):
|
||||
# """Save config as JSON to a file"""
|
||||
# json.dump(self.to_dict(), config_file, indent=4)
|
||||
|
||||
# def get_speaker_id(self, dataset_name: str, speaker_name: str) -> int:
|
||||
# if self.speaker_id_map is None:
|
||||
# self.speaker_id_map = {}
|
||||
|
||||
# full_speaker_name = f"{dataset_name}_{speaker_name}"
|
||||
# speaker_id = self.speaker_id_map.get(full_speaker_name)
|
||||
# if speaker_id is None:
|
||||
# speaker_id = len(self.speaker_id_map)
|
||||
# self.speaker_id_map[full_speaker_name] = speaker_id
|
||||
|
||||
# return speaker_id
|
||||
|
||||
# @staticmethod
|
||||
# def load(config_file: typing.TextIO) -> "TrainingConfig":
|
||||
# """Load config from a JSON file"""
|
||||
# return TrainingConfig.from_json(config_file.read())
|
||||
|
||||
# @staticmethod
|
||||
# def load_and_merge(
|
||||
# config: "TrainingConfig",
|
||||
# config_files: typing.Iterable[typing.Union[str, Path, typing.TextIO]],
|
||||
# ) -> "TrainingConfig":
|
||||
# """Loads one or more JSON configuration files and overlays them on top of an existing config"""
|
||||
# base_dict = config.to_dict()
|
||||
# for maybe_config_file in config_files:
|
||||
# if isinstance(maybe_config_file, (str, Path)):
|
||||
# # File path
|
||||
# config_file = open(maybe_config_file, "r", encoding="utf-8")
|
||||
# else:
|
||||
# # File object
|
||||
# config_file = maybe_config_file
|
||||
|
||||
# with config_file:
|
||||
# # Load new config and overlay on existing config
|
||||
# new_dict = json.load(config_file)
|
||||
# TrainingConfig.recursive_update(base_dict, new_dict)
|
||||
|
||||
# return TrainingConfig.from_dict(base_dict)
|
||||
|
||||
# @staticmethod
|
||||
# def recursive_update(
|
||||
# base_dict: typing.Dict[typing.Any, typing.Any],
|
||||
# new_dict: typing.Mapping[typing.Any, typing.Any],
|
||||
# ) -> None:
|
||||
# """Recursively overwrites values in base dictionary with values from new dictionary"""
|
||||
# for key, value in new_dict.items():
|
||||
# if isinstance(value, collections.Mapping) and (
|
||||
# base_dict.get(key) is not None
|
||||
# ):
|
||||
# TrainingConfig.recursive_update(base_dict[key], value)
|
||||
# else:
|
||||
# base_dict[key] = value
|
||||
214
mlu_370-piper/piper/src/python/piper_train/vits/dataset.py
Normal file
214
mlu_370-piper/piper/src/python/piper_train/vits/dataset.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Sequence, Union
|
||||
|
||||
import torch
|
||||
from torch import FloatTensor, LongTensor
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
_LOGGER = logging.getLogger("vits.dataset")
|
||||
|
||||
|
||||
@dataclass
class Utterance:
    """One dataset entry, as produced by ``PiperDataset.load_utterance``.

    The audio is referenced by path and loaded later with ``torch.load``.
    """

    phoneme_ids: List[int]
    audio_norm_path: Path  # file loaded as UtteranceTensors.audio_norm
    audio_spec_path: Path  # file loaded as UtteranceTensors.spectrogram
    speaker_id: Optional[int] = None  # None when the entry has no "speaker_id"
    text: Optional[str] = None  # None when the entry has no "text"
|
||||
|
||||
|
||||
@dataclass
class UtteranceTensors:
    """Tensor form of one utterance, as returned by ``PiperDataset.__getitem__``."""

    phoneme_ids: LongTensor
    spectrogram: FloatTensor
    audio_norm: FloatTensor
    speaker_id: Optional[LongTensor] = None  # one-element tensor when present
    text: Optional[str] = None

    @property
    def spec_length(self) -> int:
        """Size of dimension 1 of ``spectrogram``."""
        return self.spectrogram.size(1)
|
||||
|
||||
|
||||
@dataclass
class Batch:
    """A collated group of utterances (return type of ``UtteranceCollate.__call__``).

    NOTE(review): tensor shapes and padding are set by the collate function,
    which is only partly visible here.  Each ``*_lengths`` field presumably
    holds the unpadded length per utterance -- confirm.
    """

    phoneme_ids: LongTensor
    phoneme_lengths: LongTensor
    spectrograms: FloatTensor
    spectrogram_lengths: LongTensor
    audios: FloatTensor
    audio_lengths: LongTensor
    speaker_ids: Optional[LongTensor] = None
|
||||
|
||||
|
||||
class PiperDataset(Dataset):
    """
    Dataset of utterances read from files with one JSON object per line.

    Keys of each object:

    * phoneme_ids (required)
    * audio_norm_path (required)
    * audio_spec_path (required)
    * text (optional)
    * phonemes (optional)
    * audio_path (optional)
    """

    def __init__(
        self,
        dataset_paths: List[Union[str, Path]],
        max_phoneme_ids: Optional[int] = None,
    ):
        """Load every utterance from ``dataset_paths``.

        Utterances with more than ``max_phoneme_ids`` phoneme ids are left
        out when a limit is given.
        """
        self.utterances: List[Utterance] = []

        for raw_path in dataset_paths:
            path = Path(raw_path)
            _LOGGER.debug("Loading dataset: %s", path)
            loaded = PiperDataset.load_dataset(path, max_phoneme_ids=max_phoneme_ids)
            self.utterances.extend(loaded)

    def __len__(self):
        """Number of loaded utterances."""
        return len(self.utterances)

    def __getitem__(self, idx) -> UtteranceTensors:
        """Load the tensors of utterance ``idx`` from disk."""
        utt = self.utterances[idx]

        phoneme_ids = LongTensor(utt.phoneme_ids)
        audio_norm = torch.load(utt.audio_norm_path)
        spectrogram = torch.load(utt.audio_spec_path)

        speaker_id = None
        if utt.speaker_id is not None:
            speaker_id = LongTensor([utt.speaker_id])

        return UtteranceTensors(
            phoneme_ids=phoneme_ids,
            audio_norm=audio_norm,
            spectrogram=spectrogram,
            speaker_id=speaker_id,
            text=utt.text,
        )

    @staticmethod
    def load_dataset(
        dataset_path: Path,
        max_phoneme_ids: Optional[int] = None,
    ) -> Iterable[Utterance]:
        """Yield the utterances of one dataset file.

        Blank lines are ignored.  A line that fails to load is logged with
        its 1-based line number and skipped.  Over-long utterances are
        counted and reported in one warning at the end.
        """
        num_skipped = 0

        with open(dataset_path, "r", encoding="utf-8") as dataset_file:
            for line_number, raw_line in enumerate(dataset_file, start=1):
                line = raw_line.strip()
                if not line:
                    continue

                try:
                    utt = PiperDataset.load_utterance(line)
                    too_long = (max_phoneme_ids is not None) and (
                        len(utt.phoneme_ids) > max_phoneme_ids
                    )
                    if too_long:
                        num_skipped += 1
                    else:
                        yield utt
                except Exception:
                    _LOGGER.exception(
                        "Error on line %s of %s: %s",
                        line_number,
                        dataset_path,
                        line,
                    )

        if num_skipped > 0:
            _LOGGER.warning("Skipped %s utterance(s)", num_skipped)

    @staticmethod
    def load_utterance(line: str) -> Utterance:
        """Parse one JSON line into an ``Utterance``."""
        record = json.loads(line)
        return Utterance(
            phoneme_ids=record["phoneme_ids"],
            audio_norm_path=Path(record["audio_norm_path"]),
            audio_spec_path=Path(record["audio_spec_path"]),
            speaker_id=record.get("speaker_id"),
            text=record.get("text"),
        )
|
||||
|
||||
|
||||
class UtteranceCollate:
    """Callable that zero-pads a sequence of utterances into a single ``Batch``."""

    def __init__(self, is_multispeaker: bool, segment_size: int):
        """
        is_multispeaker: require a speaker id on every utterance and emit speaker ids.
        segment_size: minimum padded audio length.
        """
        self.is_multispeaker = is_multispeaker
        self.segment_size = segment_size

    def __call__(self, utterances: Sequence[UtteranceTensors]) -> Batch:
        """
        Pad ``utterances`` to common lengths and stack them.

        Rows of the returned batch are ordered by decreasing spectrogram length.
        """
        batch_size = len(utterances)
        assert batch_size > 0, "No utterances"

        longest_phonemes = 0
        longest_spec = 0
        longest_audio = 0
        num_mels = 0

        # First pass: validate the utterances and find the padded sizes
        for item in utterances:
            assert item.spectrogram is not None
            assert item.audio_norm is not None

            longest_phonemes = max(longest_phonemes, item.phoneme_ids.size(0))
            longest_spec = max(longest_spec, item.spectrogram.size(1))
            longest_audio = max(longest_audio, item.audio_norm.size(1))

            num_mels = item.spectrogram.size(0)
            if self.is_multispeaker:
                assert item.speaker_id is not None, "Missing speaker id"

        # Audio cannot be smaller than segment size (8192)
        longest_audio = max(longest_audio, self.segment_size)

        # Zero-filled destinations for the padded data
        phonemes_padded = LongTensor(batch_size, longest_phonemes).zero_()
        spec_padded = FloatTensor(batch_size, num_mels, longest_spec).zero_()
        audio_padded = FloatTensor(batch_size, 1, longest_audio).zero_()

        # Every entry of these is assigned in the second pass
        phoneme_lengths = LongTensor(batch_size)
        spec_lengths = LongTensor(batch_size)
        audio_lengths = LongTensor(batch_size)

        speaker_ids: Optional[LongTensor] = (
            LongTensor(batch_size) if self.is_multispeaker else None
        )

        # Second pass: copy the data in, longest spectrogram first
        by_spec_length = sorted(
            utterances, key=lambda u: u.spectrogram.size(1), reverse=True
        )
        for row, item in enumerate(by_spec_length):
            n_phonemes = item.phoneme_ids.size(0)
            n_frames = item.spectrogram.size(1)
            n_samples = item.audio_norm.size(1)

            phonemes_padded[row, :n_phonemes] = item.phoneme_ids
            phoneme_lengths[row] = n_phonemes

            spec_padded[row, :, :n_frames] = item.spectrogram
            spec_lengths[row] = n_frames

            audio_padded[row, :, :n_samples] = item.audio_norm
            audio_lengths[row] = n_samples

            if item.speaker_id is not None:
                assert speaker_ids is not None
                speaker_ids[row] = item.speaker_id

        return Batch(
            phoneme_ids=phonemes_padded,
            phoneme_lengths=phoneme_lengths,
            spectrograms=spec_padded,
            spectrogram_lengths=spec_lengths,
            audios=audio_padded,
            audio_lengths=audio_lengths,
            speaker_ids=speaker_ids,
        )
|
||||
352
mlu_370-piper/piper/src/python/piper_train/vits/lightning.py
Normal file
352
mlu_370-piper/piper/src/python/piper_train/vits/lightning.py
Normal file
@@ -0,0 +1,352 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
from torch import autocast
|
||||
from torch.nn import functional as F
|
||||
from torch.utils.data import DataLoader, Dataset, random_split
|
||||
|
||||
from .commons import slice_segments
|
||||
from .dataset import Batch, PiperDataset, UtteranceCollate
|
||||
from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss
|
||||
from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch
|
||||
from .models import MultiPeriodDiscriminator, SynthesizerTrn
|
||||
|
||||
_LOGGER = logging.getLogger("vits.lightning")
|
||||
|
||||
|
||||
class VitsModel(pl.LightningModule):
    """
    Lightning module that trains a ``SynthesizerTrn`` generator against a
    ``MultiPeriodDiscriminator``.

    Each network has its own AdamW optimizer and exponential learning-rate
    scheduler. ``training_step`` dispatches on ``optimizer_idx``: index 0 runs
    the generator step, which stores the sliced real audio and the generated
    audio on the instance; index 1 runs the discriminator step on those stored
    tensors.
    """

    def __init__(
        self,
        num_symbols: int,
        num_speakers: int,
        # audio
        resblock="2",
        resblock_kernel_sizes=(3, 5, 7),
        resblock_dilation_sizes=(
            (1, 2),
            (2, 6),
            (3, 12),
        ),
        upsample_rates=(8, 8, 4),
        upsample_initial_channel=256,
        upsample_kernel_sizes=(16, 16, 8),
        # mel
        filter_length: int = 1024,
        hop_length: int = 256,
        win_length: int = 1024,
        mel_channels: int = 80,
        sample_rate: int = 22050,
        sample_bytes: int = 2,
        channels: int = 1,
        mel_fmin: float = 0.0,
        mel_fmax: Optional[float] = None,
        # model
        inter_channels: int = 192,
        hidden_channels: int = 192,
        filter_channels: int = 768,
        n_heads: int = 2,
        n_layers: int = 6,
        kernel_size: int = 3,
        p_dropout: float = 0.1,
        n_layers_q: int = 3,
        use_spectral_norm: bool = False,
        gin_channels: int = 0,
        use_sdp: bool = True,
        segment_size: int = 8192,
        # training
        dataset: Optional[List[Union[str, Path]]] = None,
        learning_rate: float = 2e-4,
        betas: Tuple[float, float] = (0.8, 0.99),
        eps: float = 1e-9,
        batch_size: int = 1,
        lr_decay: float = 0.999875,
        init_lr_ratio: float = 1.0,
        warmup_epochs: int = 0,
        c_mel: int = 45,
        c_kl: float = 1.0,
        grad_clip: Optional[float] = None,
        num_workers: int = 1,
        seed: int = 1234,
        num_test_examples: int = 5,
        validation_split: float = 0.1,
        max_phoneme_ids: Optional[int] = None,
        **kwargs,
    ):
        """
        Build the generator and discriminator and load the dataset splits.

        All arguments are captured with ``save_hyperparameters`` and read back
        through ``self.hparams``. When ``num_speakers`` is greater than 1 and
        ``gin_channels`` is not positive, ``gin_channels`` is set to 512. When
        ``dataset`` is None no data is loaded.
        """
        super().__init__()
        self.save_hyperparameters()

        if (self.hparams.num_speakers > 1) and (self.hparams.gin_channels <= 0):
            # Default gin_channels for multi-speaker model
            self.hparams.gin_channels = 512

        # Set up models
        self.model_g = SynthesizerTrn(
            n_vocab=self.hparams.num_symbols,
            spec_channels=self.hparams.filter_length // 2 + 1,
            segment_size=self.hparams.segment_size // self.hparams.hop_length,
            inter_channels=self.hparams.inter_channels,
            hidden_channels=self.hparams.hidden_channels,
            filter_channels=self.hparams.filter_channels,
            n_heads=self.hparams.n_heads,
            n_layers=self.hparams.n_layers,
            kernel_size=self.hparams.kernel_size,
            p_dropout=self.hparams.p_dropout,
            resblock=self.hparams.resblock,
            resblock_kernel_sizes=self.hparams.resblock_kernel_sizes,
            resblock_dilation_sizes=self.hparams.resblock_dilation_sizes,
            upsample_rates=self.hparams.upsample_rates,
            upsample_initial_channel=self.hparams.upsample_initial_channel,
            upsample_kernel_sizes=self.hparams.upsample_kernel_sizes,
            n_speakers=self.hparams.num_speakers,
            gin_channels=self.hparams.gin_channels,
            use_sdp=self.hparams.use_sdp,
        )
        self.model_d = MultiPeriodDiscriminator(
            use_spectral_norm=self.hparams.use_spectral_norm
        )

        # Dataset splits
        self._train_dataset: Optional[Dataset] = None
        self._val_dataset: Optional[Dataset] = None
        self._test_dataset: Optional[Dataset] = None
        self._load_datasets(validation_split, num_test_examples, max_phoneme_ids)

        # State kept between training optimizers
        self._y = None
        self._y_hat = None

    def _load_datasets(
        self,
        validation_split: float,
        num_test_examples: int,
        max_phoneme_ids: Optional[int] = None,
    ):
        """
        Load the dataset and split it randomly into train, test and validation sets.

        The validation set gets ``int(len * validation_split)`` utterances, the
        test set gets ``num_test_examples``, and the rest is used for training.

        Raises:
            ValueError: if the dataset has fewer utterances than the validation
                and test sets need together.
        """
        if self.hparams.dataset is None:
            _LOGGER.debug("No dataset to load")
            return

        full_dataset = PiperDataset(
            self.hparams.dataset, max_phoneme_ids=max_phoneme_ids
        )
        valid_set_size = int(len(full_dataset) * validation_split)
        train_set_size = len(full_dataset) - valid_set_size - num_test_examples

        if train_set_size < 0:
            # A negative length still sums to len(full_dataset), so random_split
            # would not complain and would hand back overlapping subsets.
            raise ValueError(
                f"Dataset has {len(full_dataset)} utterance(s), which is fewer than "
                f"the {valid_set_size} validation and {num_test_examples} test "
                "example(s) requested"
            )

        self._train_dataset, self._test_dataset, self._val_dataset = random_split(
            full_dataset, [train_set_size, num_test_examples, valid_set_size]
        )

    def forward(self, text, text_lengths, scales, sid=None):
        """
        Synthesize audio with the generator's ``infer`` method.

        ``scales`` is indexed as ``[noise_scale, length_scale, noise_scale_w]``.
        Only the first value returned by ``infer`` (the audio) is returned.
        """
        noise_scale = scales[0]
        length_scale = scales[1]
        noise_scale_w = scales[2]
        audio, *_ = self.model_g.infer(
            text,
            text_lengths,
            noise_scale=noise_scale,
            length_scale=length_scale,
            noise_scale_w=noise_scale_w,
            sid=sid,
        )

        return audio

    def _make_dataloader(self, dataset):
        """Build a DataLoader over ``dataset`` using this model's collate and batch settings."""
        return DataLoader(
            dataset,
            collate_fn=UtteranceCollate(
                is_multispeaker=self.hparams.num_speakers > 1,
                segment_size=self.hparams.segment_size,
            ),
            num_workers=self.hparams.num_workers,
            batch_size=self.hparams.batch_size,
        )

    def train_dataloader(self):
        """Return the DataLoader for the training split."""
        return self._make_dataloader(self._train_dataset)

    def val_dataloader(self):
        """Return the DataLoader for the validation split."""
        return self._make_dataloader(self._val_dataset)

    def test_dataloader(self):
        """Return the DataLoader for the test split."""
        return self._make_dataloader(self._test_dataset)

    def training_step(self, batch: Batch, batch_idx: int, optimizer_idx: int):
        """
        Run the generator step for optimizer 0 and the discriminator step for optimizer 1.

        Any other ``optimizer_idx`` falls through and returns None.
        """
        if optimizer_idx == 0:
            return self.training_step_g(batch)

        if optimizer_idx == 1:
            return self.training_step_d(batch)

    def training_step_g(self, batch: Batch):
        """
        Compute and log the total generator loss for ``batch``.

        The generated audio and the matching slice of real audio are stored in
        ``self._y_hat`` and ``self._y`` for ``training_step_d``.
        """
        x = batch.phoneme_ids
        x_lengths = batch.phoneme_lengths
        y = batch.audios
        spec = batch.spectrograms
        spec_lengths = batch.spectrogram_lengths
        speaker_ids = batch.speaker_ids

        (
            y_hat,
            l_length,
            _attn,
            ids_slice,
            _x_mask,
            z_mask,
            (_z, z_p, m_p, logs_p, _m_q, logs_q),
        ) = self.model_g(x, x_lengths, spec, spec_lengths, speaker_ids)
        self._y_hat = y_hat

        mel = spec_to_mel_torch(
            spec,
            self.hparams.filter_length,
            self.hparams.mel_channels,
            self.hparams.sample_rate,
            self.hparams.mel_fmin,
            self.hparams.mel_fmax,
        )
        # Take the same segment from the real mel that the generator decoded
        y_mel = slice_segments(
            mel,
            ids_slice,
            self.hparams.segment_size // self.hparams.hop_length,
        )
        y_hat_mel = mel_spectrogram_torch(
            y_hat.squeeze(1),
            self.hparams.filter_length,
            self.hparams.mel_channels,
            self.hparams.sample_rate,
            self.hparams.hop_length,
            self.hparams.win_length,
            self.hparams.mel_fmin,
            self.hparams.mel_fmax,
        )
        # ids_slice counts spectrogram frames; scale by hop_length for audio samples
        y = slice_segments(
            y,
            ids_slice * self.hparams.hop_length,
            self.hparams.segment_size,
        )  # slice

        # Save for training_step_d
        self._y = y

        _y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = self.model_d(y, y_hat)

        # The losses are computed with autocast disabled
        with autocast(self.device.type, enabled=False):
            # Generator loss
            loss_dur = torch.sum(l_length.float())
            loss_mel = F.l1_loss(y_mel, y_hat_mel) * self.hparams.c_mel
            loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * self.hparams.c_kl

            loss_fm = feature_loss(fmap_r, fmap_g)
            loss_gen, _losses_gen = generator_loss(y_d_hat_g)
            loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl

            self.log("loss_gen_all", loss_gen_all)

            return loss_gen_all

    def training_step_d(self, batch: Batch):
        """
        Compute and log the discriminator loss.

        ``batch`` is not read; the step uses the audio tensors stored by the
        preceding ``training_step_g`` call, with the generated audio detached.
        """
        # From training_step_g
        y = self._y
        y_hat = self._y_hat
        y_d_hat_r, y_d_hat_g, _, _ = self.model_d(y, y_hat.detach())

        with autocast(self.device.type, enabled=False):
            # Discriminator
            loss_disc, _losses_disc_r, _losses_disc_g = discriminator_loss(
                y_d_hat_r, y_d_hat_g
            )
            loss_disc_all = loss_disc

            self.log("loss_disc_all", loss_disc_all)

            return loss_disc_all

    def validation_step(self, batch: Batch, batch_idx: int):
        """
        Log the summed generator and discriminator loss as ``val_loss``.

        Also synthesizes every utterance of the test split and sends the audio
        to the experiment logger.
        """
        val_loss = self.training_step_g(batch) + self.training_step_d(batch)
        self.log("val_loss", val_loss)

        # Generate audio examples
        for utt_idx, test_utt in enumerate(self._test_dataset):
            text = test_utt.phoneme_ids.unsqueeze(0).to(self.device)
            text_lengths = torch.LongTensor([len(test_utt.phoneme_ids)]).to(self.device)
            scales = [0.667, 1.0, 0.8]
            sid = (
                test_utt.speaker_id.to(self.device)
                if test_utt.speaker_id is not None
                else None
            )
            test_audio = self(text, text_lengths, scales, sid=sid).detach()

            # Scale to make louder in [-1, 1]. The divisor is the peak absolute
            # amplitude, so a negative peak larger than the positive one cannot
            # push samples outside the range; 0.01 is the floor for near-silence.
            peak = max(0.01, test_audio.abs().max())
            test_audio = test_audio * (1.0 / peak)

            tag = test_utt.text or str(utt_idx)
            self.logger.experiment.add_audio(
                tag, test_audio, sample_rate=self.hparams.sample_rate
            )

        return val_loss

    def configure_optimizers(self):
        """
        Return ``(optimizers, schedulers)`` for the generator and discriminator.

        Index 0 belongs to the generator and index 1 to the discriminator, which
        is the order ``training_step`` relies on.
        """
        optimizers = [
            torch.optim.AdamW(
                model.parameters(),
                lr=self.hparams.learning_rate,
                betas=self.hparams.betas,
                eps=self.hparams.eps,
            )
            for model in (self.model_g, self.model_d)
        ]
        schedulers = [
            torch.optim.lr_scheduler.ExponentialLR(
                optimizer, gamma=self.hparams.lr_decay
            )
            for optimizer in optimizers
        ]

        return optimizers, schedulers

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Add the ``VitsModel`` argument group to ``parent_parser`` and return the parser."""
        parser = parent_parser.add_argument_group("VitsModel")
        parser.add_argument("--batch-size", type=int, required=True)
        parser.add_argument("--validation-split", type=float, default=0.1)
        parser.add_argument("--num-test-examples", type=int, default=5)
        parser.add_argument(
            "--max-phoneme-ids",
            type=int,
            help="Exclude utterances with phoneme id lists longer than this",
        )
        #
        parser.add_argument("--hidden-channels", type=int, default=192)
        parser.add_argument("--inter-channels", type=int, default=192)
        parser.add_argument("--filter-channels", type=int, default=768)
        parser.add_argument("--n-layers", type=int, default=6)
        parser.add_argument("--n-heads", type=int, default=2)
        #
        return parent_parser
|
||||
58
mlu_370-piper/piper/src/python/piper_train/vits/losses.py
Normal file
58
mlu_370-piper/piper/src/python/piper_train/vits/losses.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import torch
|
||||
|
||||
|
||||
def feature_loss(fmap_r, fmap_g):
    """
    Return twice the sum of mean absolute differences between paired feature maps.

    ``fmap_r`` and ``fmap_g`` are parallel nested sequences of tensors. The
    ``fmap_r`` tensors are detached, so gradients only flow through ``fmap_g``.
    Returns the integer 0 when there are no pairs.
    """
    total = 0
    for real_maps, generated_maps in zip(fmap_r, fmap_g):
        for real_map, generated_map in zip(real_maps, generated_maps):
            target = real_map.float().detach()
            difference = target - generated_map.float()
            total = total + difference.abs().mean()

    return total * 2
|
||||
|
||||
|
||||
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """
    Least-squares discriminator loss over paired discriminator outputs.

    For each pair the real term is ``mean((1 - real) ** 2)`` and the generated
    term is ``mean(generated ** 2)``.

    Returns ``(loss, r_losses, g_losses)``: the summed loss, and the per-pair
    real and generated terms as Python floats.
    """
    total = 0
    real_terms = []
    generated_terms = []
    for real_out, generated_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = ((1 - real_out.float()) ** 2).mean()
        generated_term = (generated_out.float() ** 2).mean()
        total = total + (real_term + generated_term)
        real_terms.append(real_term.item())
        generated_terms.append(generated_term.item())

    return total, real_terms, generated_terms
|
||||
|
||||
|
||||
def generator_loss(disc_outputs):
    """
    Least-squares generator loss: the sum of ``mean((1 - output) ** 2)`` per output.

    Returns ``(loss, gen_losses)`` where ``gen_losses`` holds the per-output
    terms as tensors.
    """
    per_output = [((1 - output.float()) ** 2).mean() for output in disc_outputs]

    total = 0
    for term in per_output:
        total = total + term

    return total, per_output
|
||||
|
||||
|
||||
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """
    Masked average of
    ``logs_p - logs_q - 0.5 + 0.5 * (z_p - m_p) ** 2 * exp(-2 * logs_p)``.

    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]

    All inputs are cast to float. The masked sum is divided by the sum of
    ``z_mask``.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (
        tensor.float() for tensor in (z_p, logs_q, m_p, logs_p, z_mask)
    )

    squared_error = (z_p - m_p) ** 2
    per_element = (logs_p - logs_q - 0.5) + 0.5 * squared_error * torch.exp(
        -2.0 * logs_p
    )
    return torch.sum(per_element * z_mask) / torch.sum(z_mask)
|
||||
@@ -0,0 +1,139 @@
|
||||
import torch
|
||||
import torch.utils.data
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
|
||||
MAX_WAV_VALUE = 32768.0
|
||||
|
||||
|
||||
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    Return ``log(max(x, clip_val) * C)`` elementwise.

    PARAMS
    ------
    C: compression factor
    clip_val: lower bound applied to ``x`` before the logarithm
    """
    floored = x.clamp(min=clip_val)
    return (floored * C).log()
|
||||
|
||||
|
||||
def dynamic_range_decompression_torch(x, C=1):
    """
    Return ``exp(x) / C`` elementwise.

    PARAMS
    ------
    C: compression factor used to compress
    """
    expanded = x.exp()
    return expanded / C
|
||||
|
||||
|
||||
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default arguments."""
    return dynamic_range_compression_torch(magnitudes)
|
||||
|
||||
|
||||
def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default arguments."""
    return dynamic_range_decompression_torch(magnitudes)
|
||||
|
||||
|
||||
# Cache of mel filterbank tensors, filled lazily by spec_to_mel_torch and
# mel_spectrogram_torch. Keys are strings that include the dtype and device of
# the tensor being processed.
mel_basis = {}
# Cache of Hann window tensors, filled lazily by spectrogram_torch and
# mel_spectrogram_torch. Keys are strings that include the window size, dtype
# and device.
hann_window = {}
|
||||
|
||||
|
||||
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """
    Return the magnitude spectrogram of ``y`` computed with a Hann-windowed STFT.

    ``y`` is reflect-padded by ``int((n_fft - hop_size) / 2)`` samples on both
    sides before the transform. A message is printed when ``y`` has values
    outside [-1, 1]. ``sampling_rate`` is accepted but not used here.

    The Hann window is cached per window size, dtype and device.
    """
    lowest = torch.min(y)
    if lowest < -1.0:
        print("min value is ", lowest)
    highest = torch.max(y)
    if highest > 1.0:
        print("max value is ", highest)

    window_key = f"{win_size}_{y.dtype}_{y.device}"
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).type_as(y)

    # reflect padding needs an extra dimension, hence the unsqueeze/squeeze pair
    pad_amount = int((n_fft - hop_size) / 2)
    padded = torch.nn.functional.pad(
        y.unsqueeze(1), (pad_amount, pad_amount), mode="reflect"
    ).squeeze(1)

    stft_complex = torch.stft(
        padded,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[window_key],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )

    # Magnitude from the (real, imag) pairs; 1e-6 keeps the sqrt away from zero
    real_imag = torch.view_as_real(stft_complex)
    return torch.sqrt(real_imag.pow(2).sum(-1) + 1e-6)
|
||||
|
||||
|
||||
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """
    Project a linear spectrogram onto a mel filterbank and log-compress it.

    The filterbank comes from ``librosa.filters.mel`` and is cached in
    ``mel_basis``. The cache key covers every filterbank parameter
    (sampling rate, n_fft, num_mels, fmin, fmax) plus the dtype and device of
    ``spec``, so calls with different settings never share a stale filterbank.

    Returns ``spectral_normalize_torch(mel_filterbank @ spec)``.
    """
    cache_key = "_".join(
        str(part)
        for part in (
            sampling_rate,
            n_fft,
            num_mels,
            fmin,
            fmax,
            spec.dtype,
            spec.device,
        )
    )
    if cache_key not in mel_basis:
        mel = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[cache_key] = torch.from_numpy(mel).type_as(spec)

    mel_spec = torch.matmul(mel_basis[cache_key], spec)
    return spectral_normalize_torch(mel_spec)
|
||||
|
||||
|
||||
def mel_spectrogram_torch(
    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
    """
    Compute a log-compressed mel spectrogram of the audio tensor ``y``.

    ``y`` is reflect-padded by ``int((n_fft - hop_size) / 2)`` samples on both
    sides, transformed with a Hann-windowed STFT, reduced to magnitudes,
    projected onto a ``librosa.filters.mel`` filterbank and passed through
    ``spectral_normalize_torch``. A message is printed when ``y`` has values
    outside [-1, 1].

    The filterbank cache key covers every filterbank parameter (sampling rate,
    n_fft, num_mels, fmin, fmax) plus the dtype and device of ``y``, so calls
    with different settings never share a stale filterbank. The Hann window is
    cached per window size, dtype and device.
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    dtype_device = str(y.dtype) + "_" + str(y.device)
    mel_key = "_".join(
        str(part) for part in (sampling_rate, n_fft, num_mels, fmin, fmax, dtype_device)
    )
    window_key = str(win_size) + "_" + dtype_device

    if mel_key not in mel_basis:
        mel = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[mel_key] = torch.from_numpy(mel).type_as(y)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).type_as(y)

    # reflect padding needs an extra dimension, hence the unsqueeze/squeeze pair
    pad_amount = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (pad_amount, pad_amount),
        mode="reflect",
    )
    y = y.squeeze(1)

    spec = torch.view_as_real(
        torch.stft(
            y,
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[window_key],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )

    # Magnitude from the (real, imag) pairs; 1e-6 keeps the sqrt away from zero
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
||||
732
mlu_370-piper/piper/src/python/piper_train/vits/models.py
Normal file
732
mlu_370-piper/piper/src/python/piper_train/vits/models.py
Normal file
@@ -0,0 +1,732 @@
|
||||
import math
|
||||
import typing
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import Conv1d, Conv2d, ConvTranspose1d
|
||||
from torch.nn import functional as F
|
||||
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
|
||||
|
||||
from . import attentions, commons, modules, monotonic_align
|
||||
from .commons import get_padding, init_weights
|
||||
|
||||
|
||||
class StochasticDurationPredictor(nn.Module):
    """
    Flow-based duration predictor.

    With ``reverse=False`` the forward pass returns ``nll + logq`` per batch
    item for the given durations ``w``. With ``reverse=True`` it draws noise
    scaled by ``noise_scale``, runs the flows backwards and returns ``logw``.
    """

    def __init__(
        self,
        in_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float,
        n_flows: int = 4,
        gin_channels: int = 0,
    ):
        """
        Build the main and posterior flow stacks and their conditioning layers.

        The ``filter_channels`` argument is overwritten with ``in_channels``,
        so the passed value has no effect. ``self.cond`` is only created when
        ``gin_channels`` is non-zero.
        """
        super().__init__()
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.log_flow = modules.Log()
        # Main stack: one ElementwiseAffine, then n_flows (ConvFlow, Flip) pairs
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for i in range(n_flows):
            self.flows.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            self.flows.append(modules.Flip())

        # Layers that encode the durations w for the posterior flows
        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        # Posterior stack: always 4 (ConvFlow, Flip) pairs, independent of n_flows
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for i in range(4):
            self.post_flows.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            self.post_flows.append(modules.Flip())

        # Layers that encode the input x used to condition both stacks
        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        """
        Compute the training objective (``reverse=False``) or sample ``logw``.

        ``x`` and ``g`` are detached first, so no gradient flows back into
        whatever produced them. ``w`` is required when ``reverse`` is False.
        ``g`` must only be passed when the module was built with a non-zero
        ``gin_channels``, because ``self.cond`` does not exist otherwise.
        NOTE(review): x is presumably [b, in_channels, t] and x_mask / w
        [b, 1, t]; confirm against the caller.
        """
        x = torch.detach(x)
        x = self.pre(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if not reverse:
            flows = self.flows
            assert w is not None

            # Posterior path: noise e_q is pushed through post_flows,
            # conditioned on the encoded input plus the encoded durations
            logdet_tot_q = 0
            h_w = self.post_pre(w)
            h_w = self.post_convs(h_w, x_mask)
            h_w = self.post_proj(h_w) * x_mask
            e_q = torch.randn(w.size(0), 2, w.size(2)).type_as(x) * x_mask
            z_q = e_q
            for flow in self.post_flows:
                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
                logdet_tot_q += logdet_q
            z_u, z1 = torch.split(z_q, [1, 1], 1)
            u = torch.sigmoid(z_u) * x_mask
            z0 = (w - u) * x_mask
            # Log-determinant contribution of the sigmoid applied to z_u
            logdet_tot_q += torch.sum(
                (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
            )
            logq = (
                torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
                - logdet_tot_q
            )

            # Main path: log flow on z0, then the main stack conditioned on x
            logdet_tot = 0
            z0, logdet = self.log_flow(z0, x_mask)
            logdet_tot += logdet
            z = torch.cat([z0, z1], 1)
            for flow in flows:
                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
                logdet_tot = logdet_tot + logdet
            nll = (
                torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
                - logdet_tot
            )
            return nll + logq  # [b]
        else:
            flows = list(reversed(self.flows))
            # Drops the second-to-last entry of the reversed list, which is the
            # first ConvFlow appended in __init__
            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
            z = torch.randn(x.size(0), 2, x.size(2)).type_as(x) * noise_scale

            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            # Only the first of the two channels is returned
            z0, z1 = torch.split(z, [1, 1], 1)
            logw = z0
            return logw
|
||||
|
||||
|
||||
class DurationPredictor(nn.Module):
    """
    Convolutional duration predictor: two Conv1d / ReLU / LayerNorm / Dropout
    stages followed by a projection to one channel.
    """

    def __init__(
        self,
        in_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float,
        gin_channels: int = 0,
    ):
        """Create the layers; ``self.cond`` only exists when ``gin_channels`` is non-zero."""
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        half_kernel = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """
        Return the masked one-channel prediction for ``x``.

        ``x`` and ``g`` are detached, so no gradient reaches their producers.
        """
        hidden = torch.detach(x)
        if g is not None:
            hidden = hidden + self.cond(torch.detach(g))

        # First stage
        hidden = torch.relu(self.conv_1(hidden * x_mask))
        hidden = self.drop(self.norm_1(hidden))

        # Second stage
        hidden = torch.relu(self.conv_2(hidden * x_mask))
        hidden = self.drop(self.norm_2(hidden))

        projected = self.proj(hidden * x_mask)
        return projected * x_mask
|
||||
|
||||
|
||||
class TextEncoder(nn.Module):
    """
    Embeds symbol ids, encodes them with an attention encoder and projects the
    result to ``m`` and ``logs`` statistics.
    """

    def __init__(
        self,
        n_vocab: int,
        out_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
    ):
        """Create the embedding, the attention encoder and the statistics projection."""
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.emb = nn.Embedding(n_vocab, hidden_channels)
        # Embedding weights are drawn with std hidden_channels ** -0.5
        embedding_std = hidden_channels**-0.5
        nn.init.normal_(self.emb.weight, 0.0, embedding_std)

        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        # Twice out_channels: one half for m, the other for logs
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths):
        """Return ``(encoded, m, logs, x_mask)`` for symbol ids ``x`` of lengths ``x_lengths``."""
        embedded = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
        embedded = embedded.transpose(1, -1)  # [b, h, t]

        length_mask = commons.sequence_mask(x_lengths, embedded.size(2))
        x_mask = length_mask.unsqueeze(1).type_as(embedded)

        encoded = self.encoder(embedded * x_mask, x_mask)
        stats = self.proj(encoded) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return encoded, m, logs, x_mask
|
||||
|
||||
|
||||
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` (ResidualCouplingLayer, Flip) pairs that can run in either direction."""

    def __init__(
        self,
        channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        n_flows: int = 4,
        gin_channels: int = 0,
    ):
        """Create the flow stack; every coupling layer is built with ``mean_only=True``."""
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """
        Run ``x`` through the flows, in reversed order when ``reverse`` is true.

        In the forward direction each flow returns a pair whose second item is
        discarded; in the reverse direction each flow returns the tensor only.
        """
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
|
||||
|
||||
|
||||
class PosteriorEncoder(nn.Module):
    """
    Encodes an input with a WN stack into ``m`` and ``logs`` statistics and
    draws a sample ``z`` from them.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        gin_channels: int = 0,
    ):
        """Create the input projection, the WN encoder and the statistics projection."""
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        # Twice out_channels: one half for m, the other for logs
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` where ``z = (m + noise * exp(logs)) * x_mask``."""
        length_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = length_mask.unsqueeze(1).type_as(x)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
|
||||
|
||||
|
||||
class Generator(torch.nn.Module):
    """Upsampling decoder that turns a latent sequence into a waveform.

    Each stage applies a weight-normalised transposed convolution followed
    by ``num_kernels`` residual blocks whose outputs are averaged.
    """

    def __init__(
        self,
        initial_channel: int,
        resblock: typing.Optional[str],
        resblock_kernel_sizes: typing.Tuple[int, ...],
        resblock_dilation_sizes: typing.Tuple[typing.Tuple[int, ...], ...],
        upsample_rates: typing.Tuple[int, ...],
        upsample_initial_channel: int,
        upsample_kernel_sizes: typing.Tuple[int, ...],
        gin_channels: int = 0,
    ):
        super(Generator, self).__init__()
        self.LRELU_SLOPE = 0.1
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # Only the string "1" selects ResBlock1; anything else uses ResBlock2.
        if resblock == "1":
            resblock_module = modules.ResBlock1
        else:
            resblock_module = modules.ResBlock2

        # Upsampling stack: the channel count halves at every stage.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            in_ch = upsample_initial_channel // (2**stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            transposed = ConvTranspose1d(
                in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2
            )
            self.ups.append(weight_norm(transposed))

        # ``num_kernels`` residual blocks per stage, stored flat in stage order.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            stage_ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilations in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_module(stage_ch, kernel, dilations))

        # ``stage_ch`` is the channel count of the last stage.
        self.conv_post = Conv1d(stage_ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x`` (optionally conditioned on ``g``) to a tanh-bounded signal."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i, up in enumerate(self.ups):
            x = F.leaky_relu(x, self.LRELU_SLOPE)
            x = up(x)

            # Sum the outputs of the residual blocks that belong to stage
            # ``i``.  The flat list is scanned rather than indexed directly;
            # NOTE(review): presumably to keep the module script-exportable.
            xs = torch.zeros(1)
            first = i * self.num_kernels
            for j, resblock in enumerate(self.resblocks):
                offset = j - first
                if offset == 0:
                    xs = resblock(x)
                elif (offset < self.num_kernels) and (offset > 0):
                    xs += resblock(x)
            x = xs / self.num_kernels

        # The final activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers and residual blocks."""
        print("Removing weight norm...")
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
|
||||
|
||||
|
||||
class DiscriminatorP(torch.nn.Module):
    """Discriminator that views the signal as a 2-D grid with ``period`` columns."""

    def __init__(
        self,
        period: int,
        kernel_size: int = 5,
        stride: int = 3,
        use_spectral_norm: bool = False,
    ):
        super(DiscriminatorP, self).__init__()
        self.LRELU_SLOPE = 0.1
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        # Channel widths of the five convolutions; the first four stride
        # along the folded time axis, the last one uses stride 1.
        widths = (1, 32, 128, 512, 1024, 1024)
        strides = ((stride, 1), (stride, 1), (stride, 1), (stride, 1), 1)
        layers = []
        for c_in, c_out, step in zip(widths[:-1], widths[1:], strides):
            conv = Conv2d(
                c_in,
                c_out,
                (kernel_size, 1),
                step,
                padding=(get_padding(kernel_size, 1), 0),
            )
            layers.append(norm_f(conv))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened score, feature maps)`` for a ``(b, c, t)`` input."""
        fmap = []

        # 1d to 2d: reflect-pad the tail so that t is a multiple of the
        # period, then fold time into (t // period, period).
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), self.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
||||
|
||||
|
||||
class DiscriminatorS(torch.nn.Module):
    """Discriminator built from a stack of (grouped) 1-D convolutions."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        self.LRELU_SLOPE = 0.1
        if use_spectral_norm:
            norm_f = spectral_norm
        else:
            norm_f = weight_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        layout = (
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, kernel, step, groups=groups, padding=pad))
                for c_in, c_out, kernel, step, groups, pad in layout
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened score, feature maps)``; one map per convolution."""
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), self.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
||||
|
||||
|
||||
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` per period."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(
                DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and on ``y_hat``.

        Returns four lists, one entry per sub-discriminator: scores for
        ``y``, scores for ``y_hat``, feature maps for ``y`` and feature maps
        for ``y_hat``.
        """
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for disc in self.discriminators:
            real_score, real_fmap = disc(y)
            fake_score, fake_fmap = disc(y_hat)
            y_d_rs.append(real_score)
            y_d_gs.append(fake_score)
            fmap_rs.append(real_fmap)
            fmap_gs.append(fake_fmap)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
class SynthesizerTrn(nn.Module):
    """Synthesizer for training.

    Bundles the text encoder (``enc_p``), posterior encoder (``enc_q``),
    coupling-flow stack (``flow``), duration predictor (``dp``) and waveform
    decoder (``dec``), plus a speaker embedding table (``emb_g``) when
    ``n_speakers > 1``.
    """

    def __init__(
        self,
        n_vocab: int,
        spec_channels: int,
        segment_size: int,
        inter_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        resblock: str,
        resblock_kernel_sizes: typing.Tuple[int, ...],
        resblock_dilation_sizes: typing.Tuple[typing.Tuple[int, ...], ...],
        upsample_rates: typing.Tuple[int, ...],
        upsample_initial_channel: int,
        upsample_kernel_sizes: typing.Tuple[int, ...],
        n_speakers: int = 1,
        gin_channels: int = 0,
        use_sdp: bool = True,
    ):
        super().__init__()

        # Hyper-parameters, stored verbatim.
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.use_sdp = use_sdp

        # Sub-modules.  The construction order is kept as-is so parameter
        # initialisation draws from the RNG in the same sequence.
        self.enc_p = TextEncoder(
            n_vocab,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
        )

        if use_sdp:
            self.dp = StochasticDurationPredictor(
                hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
            )
        else:
            self.dp = DurationPredictor(
                hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
            )

        # The speaker table only exists for multi-speaker models.
        if n_speakers > 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)

    def forward(self, x, x_lengths, y, y_lengths, sid=None):
        """Training pass.

        Returns ``(o, l_length, attn, ids_slice, x_mask, y_mask,
        (z, z_p, m_p, logs_p, m_q, logs_q))`` where ``o`` is the decoder
        output for a random latent segment of ``segment_size`` frames and
        ``l_length`` is the duration loss.
        """
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        if self.n_speakers > 1:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = None

        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)

        with torch.no_grad():
            # negative cross-entropy between z_p and the prior, expanded
            # into four terms so it can be evaluated with matrix products
            inv_var = torch.exp(-2 * logs_p)  # [b, d, t]
            term_const = torch.sum(
                -0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True
            )  # [b, 1, t_s]
            term_sq = torch.matmul(
                -0.5 * (z_p**2).transpose(1, 2), inv_var
            )  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
            term_cross = torch.matmul(
                z_p.transpose(1, 2), (m_p * inv_var)
            )  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
            term_mean = torch.sum(
                -0.5 * (m_p**2) * inv_var, [1], keepdim=True
            )  # [b, 1, t_s]
            neg_cent = term_const + term_sq + term_cross + term_mean

            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
            hard_path = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1))
            attn = hard_path.unsqueeze(1).detach()

        # Per-token durations implied by the alignment.
        w = attn.sum(2)
        if self.use_sdp:
            l_length = self.dp(x, x_mask, w, g=g)
            l_length = l_length / torch.sum(x_mask)
        else:
            logw_ = torch.log(w + 1e-6) * x_mask
            logw = self.dp(x, x_mask, g=g)
            squared_error = torch.sum((logw - logw_) ** 2, [1, 2])
            l_length = squared_error / torch.sum(x_mask)  # for averaging

        # expand prior
        path = attn.squeeze(1)
        m_p = torch.matmul(path, m_p.transpose(1, 2)).transpose(1, 2)
        logs_p = torch.matmul(path, logs_p.transpose(1, 2)).transpose(1, 2)

        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        latents = (z, z_p, m_p, logs_p, m_q, logs_q)
        return o, l_length, attn, ids_slice, x_mask, y_mask, latents

    def infer(
        self,
        x,
        x_lengths,
        sid=None,
        noise_scale=0.667,
        length_scale=1,
        noise_scale_w=0.8,
        max_len=None,
    ):
        """Synthesise from token ids.

        Returns ``(o, attn, y_mask, (z, z_p, m_p, logs_p))``.  ``noise_scale``
        scales the prior noise, ``noise_scale_w`` is forwarded to the
        stochastic duration predictor, ``length_scale`` multiplies the
        predicted durations and ``max_len`` truncates the latent before
        decoding.
        """
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        if self.n_speakers > 1:
            assert sid is not None, "Missing speaker id"
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = None

        if self.use_sdp:
            logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
        else:
            logw = self.dp(x, x_mask, g=g)

        # Durations are rounded up; every utterance gets at least one frame.
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        frame_valid = commons.sequence_mask(y_lengths, y_lengths.max())
        y_mask = torch.unsqueeze(frame_valid, 1).type_as(x_mask)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        path = attn.squeeze(1)
        # [b, t', t], [b, t, d] -> [b, d, t']
        m_p = torch.matmul(path, m_p.transpose(1, 2)).transpose(1, 2)
        # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(path, logs_p.transpose(1, 2)).transpose(1, 2)

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)

        return o, attn, y_mask, (z, z_p, m_p, logs_p)

    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
        """Re-render ``y`` from speaker ``sid_src`` as speaker ``sid_tgt``.

        Returns ``(o_hat, y_mask, (z, z_p, z_hat))``.
        """
        assert self.n_speakers > 1, "n_speakers have to be larger than 1."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)

        # Encode with the source speaker, then invert the flow with the target.
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
        return o_hat, y_mask, (z, z_p, z_hat)
|
||||
527
mlu_370-piper/piper/src/python/piper_train/vits/modules.py
Normal file
527
mlu_370-piper/piper/src/python/piper_train/vits/modules.py
Normal file
@@ -0,0 +1,527 @@
|
||||
import math
|
||||
import typing
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import Conv1d
|
||||
from torch.nn import functional as F
|
||||
from torch.nn.utils import remove_weight_norm, weight_norm
|
||||
|
||||
from .commons import fused_add_tanh_sigmoid_multiply, get_padding, init_weights
|
||||
from .transforms import piecewise_rational_quadratic_transform
|
||||
|
||||
|
||||
class LayerNorm(nn.Module):
    """Layer normalisation applied over axis 1 of the input."""

    def __init__(self, channels: int, eps: float = 1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        # Learnable scale and shift, one value per channel.
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` over axis 1 and return a tensor of the same layout."""
        # F.layer_norm works on trailing axes, so axis 1 is swapped to the
        # end for the call and swapped back afterwards.
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return normed.transpose(1, -1)
|
||||
|
||||
|
||||
class ConvReluNorm(nn.Module):
    """Residual stack of ``n_layers`` conv -> layer-norm -> ReLU -> dropout.

    The final 1x1 projection is zero-initialised, so a freshly constructed
    module returns its (masked) input unchanged.
    """

    def __init__(
        self,
        in_channels: int,
        hidden_channels: int,
        out_channels: int,
        kernel_size: int,
        n_layers: int,
        p_dropout: float,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message previously said "larger than 0", which contradicted
        # the condition actually being checked.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        # First layer maps in_channels -> hidden_channels ...
        self.conv_layers.append(
            nn.Conv1d(
                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
            )
        )
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
        # ... the remaining n_layers - 1 keep the hidden width.
        for _ in range(n_layers - 1):
            self.conv_layers.append(
                nn.Conv1d(
                    hidden_channels,
                    hidden_channels,
                    kernel_size,
                    padding=kernel_size // 2,
                )
            )
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero-init so the residual branch starts out as a no-op.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Return ``(x + proj(stack(x))) * x_mask``."""
        x_org = x
        for conv, norm in zip(self.conv_layers, self.norm_layers):
            x = conv(x * x_mask)
            x = norm(x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
|
||||
|
||||
|
||||
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """

    def __init__(
        self, channels: int, kernel_size: int, n_layers: int, p_dropout: float = 0.0
    ):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            # Dilation grows as kernel_size ** layer index; the padding keeps
            # the sequence length unchanged.
            dilation = kernel_size**layer_idx
            padding = (kernel_size * dilation - dilation) // 2
            depthwise = nn.Conv1d(
                channels,
                channels,
                kernel_size,
                groups=channels,
                dilation=dilation,
                padding=padding,
            )
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Apply the residual layers to ``x``; ``g`` (if given) is added first."""
        if g is not None:
            x = x + g
        for layer_idx in range(self.n_layers):
            # depthwise conv -> norm -> GELU
            y = F.gelu(self.norms_1[layer_idx](self.convs_sep[layer_idx](x * x_mask)))
            # pointwise conv -> norm -> GELU
            y = F.gelu(self.norms_2[layer_idx](self.convs_1x1[layer_idx](y)))
            x = x + self.drop(y)
        return x * x_mask
|
||||
|
||||
|
||||
class WN(torch.nn.Module):
    """Stack of gated, dilated, weight-normalised 1-D convolutions.

    Every layer but the last emits a residual half (added back to the
    running input) and a skip half (accumulated into the output); the last
    layer only contributes to the output.
    """

    def __init__(
        self,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        gin_channels: int = 0,
        p_dropout: float = 0,
    ):
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        # NOTE(review): stored as a 1-tuple, exactly as before; looks
        # accidental but is preserved so the attribute value is unchanged.
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One conditioning projection shared by all layers; each layer
            # later takes its own 2 * hidden_channels slice.
            cond_conv = torch.nn.Conv1d(
                gin_channels, 2 * hidden_channels * n_layers, 1
            )
            self.cond_layer = torch.nn.utils.weight_norm(cond_conv, name="weight")

        last = n_layers - 1
        for layer_idx in range(n_layers):
            dilation = dilation_rate**layer_idx
            padding = int((kernel_size * dilation - dilation) / 2)
            gated_conv = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilation,
                padding=padding,
            )
            self.in_layers.append(
                torch.nn.utils.weight_norm(gated_conv, name="weight")
            )

            # last one is not necessary: the final layer has no residual half
            if layer_idx < last:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_conv = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            self.res_skip_layers.append(
                torch.nn.utils.weight_norm(res_skip_conv, name="weight")
            )

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of all skip outputs for input ``x``."""
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])
        width = 2 * self.hidden_channels

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                # Slice out this layer's share of the conditioning signal.
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset : cond_offset + width, :]

            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                # First half feeds the next layer, second half is the skip.
                res_acts = res_skip_acts[:, : self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Remove weight normalisation from every convolution in the stack."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for layer in self.in_layers:
            torch.nn.utils.remove_weight_norm(layer)
        for layer in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(layer)
|
||||
|
||||
|
||||
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs."""

    def __init__(
        self,
        channels: int,
        kernel_size: int = 3,
        dilation: typing.Tuple[int] = (1, 3, 5),
    ):
        super(ResBlock1, self).__init__()
        self.LRELU_SLOPE = 0.1

        # First conv of each pair: one per entry of ``dilation`` (exactly
        # the first three entries are used).
        rates = (dilation[0], dilation[1], dilation[2])
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=rate,
                        padding=get_padding(kernel_size, rate),
                    )
                )
                for rate in rates
            ]
        )
        self.convs1.apply(init_weights)

        # Second conv of each pair: always undilated.
        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                )
                for _ in range(3)
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; ``x_mask`` (if given) masks every step."""
        masked = x_mask is not None
        for dilated, plain in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, self.LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            xt = dilated(xt)
            xt = F.leaky_relu(xt, self.LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            xt = plain(xt)
            x = xt + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Remove weight normalisation from all six convolutions."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
|
||||
|
||||
|
||||
class ResBlock2(torch.nn.Module):
    """Residual block of two dilated convolutions."""

    def __init__(
        self, channels: int, kernel_size: int = 3, dilation: typing.Tuple[int] = (1, 3)
    ):
        super(ResBlock2, self).__init__()
        self.LRELU_SLOPE = 0.1

        # Exactly the first two entries of ``dilation`` are used.
        rates = (dilation[0], dilation[1])
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=rate,
                        padding=get_padding(kernel_size, rate),
                    )
                )
                for rate in rates
            ]
        )
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions; ``x_mask`` (if given) masks every step."""
        masked = x_mask is not None
        for conv in self.convs:
            xt = F.leaky_relu(x, self.LRELU_SLOPE)
            if masked:
                xt = xt * x_mask
            x = conv(xt) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Remove weight normalisation from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)
|
||||
|
||||
|
||||
class Log(nn.Module):
    """Element-wise log flow; the reverse direction exponentiates."""

    def forward(
        self, x: torch.Tensor, x_mask: torch.Tensor, reverse: bool = False, **kwargs
    ):
        """Return ``(log(x) * mask, logdet)`` or, in reverse, ``exp(x) * mask``."""
        if reverse:
            return torch.exp(x) * x_mask

        # Inputs are clamped to at least 1e-5 before taking the log.
        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        logdet = torch.sum(-y, [1, 2])
        return y, logdet
|
||||
|
||||
|
||||
class Flip(nn.Module):
    """Reverse the order of axis 1; identical in both directions."""

    def forward(self, x: torch.Tensor, *args, reverse: bool = False, **kwargs):
        """Return ``(flipped, zero logdet)`` or, in reverse, just ``flipped``."""
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped

        # Flipping does not change volume: one zero per batch element.
        logdet = torch.zeros(flipped.size(0)).type_as(flipped)
        return flipped, logdet
|
||||
|
||||
|
||||
class ElementwiseAffine(nn.Module):
    """Per-channel learnable shift ``m`` and log-scale ``logs``."""

    def __init__(self, channels: int):
        super().__init__()
        self.channels = channels
        # Both start at zero, i.e. the identity transform.
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Return ``(m + exp(logs) * x, logdet)`` or, in reverse, the inverse map."""
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask

        y = self.m + torch.exp(self.logs) * x
        y = y * x_mask
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
|
||||
|
||||
|
||||
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer: the first channel half parameterises a
    transform of the second half.

    With ``mean_only=True`` only a shift is predicted and the log-scale is
    fixed at zero.
    """

    def __init__(
        self,
        channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        p_dropout: float = 0,
        gin_channels: int = 0,
        mean_only: bool = False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        # One output group (mean) when mean_only, two (mean, log-scale) otherwise.
        stat_groups = 2 - mean_only
        self.post = nn.Conv1d(hidden_channels, self.half_channels * stat_groups, 1)
        # Zero-init: the layer starts out as the identity transform.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` or, in reverse, the inverted ``x`` only."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        x = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return x, logdet
|
||||
|
||||
|
||||
class ConvFlow(nn.Module):
    """Coupling flow whose transform is a piecewise rational-quadratic spline.

    The first channel half is processed by a ``DDSConv`` stack to predict
    the spline parameters applied to the second half.
    """

    def __init__(
        self,
        in_channels: int,
        filter_channels: int,
        kernel_size: int,
        n_layers: int,
        num_bins: int = 10,
        tail_bound: float = 5.0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
        # Per transformed channel: num_bins widths, num_bins heights and
        # num_bins - 1 derivatives.
        self.proj = nn.Conv1d(
            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
        )
        # Zero-init so all spline parameters start out at zero.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` or, in reverse, the inverted ``x`` only."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0)
        h = self.convs(h, x_mask, g=g)
        h = self.proj(h) * x_mask

        b, c, t = x0.shape
        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]

        bins = self.num_bins
        scale = math.sqrt(self.filter_channels)
        unnormalized_widths = h[..., :bins] / scale
        unnormalized_heights = h[..., bins : 2 * bins] / scale
        unnormalized_derivatives = h[..., 2 * bins :]

        x1, logabsdet = piecewise_rational_quadratic_transform(
            x1,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails="linear",
            tail_bound=self.tail_bound,
        )

        x = torch.cat([x0, x1], 1) * x_mask

        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if reverse:
            return x
        return x, logdet
|
||||
@@ -0,0 +1,2 @@
|
||||
# Build the monotonic_align Cython extension in place.
# ``all`` is declared phony so that a file or directory named "all" can
# never make the target look up to date.
.PHONY: all

all:
	python3 setup.py build_ext --inplace
|
||||
@@ -0,0 +1,20 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from .monotonic_align.core import maximum_path_c
|
||||
|
||||
|
||||
def maximum_path(neg_cent, mask):
    """Cython optimized version.

    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]

    Returns a 0/1 tensor with the shape of ``neg_cent``, on its original
    device and with its original dtype.
    """
    out_device = neg_cent.device
    out_dtype = neg_cent.dtype

    # The compiled kernel works on float32 / int32 NumPy buffers on the CPU.
    scores = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(scores.shape, dtype=np.int32)

    # Valid extent of each batch item along both axes, read off the mask.
    rows_valid = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    cols_valid = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)

    # Fills ``path`` in place.
    maximum_path_c(path, scores, rows_valid, cols_valid)
    return torch.from_numpy(path).to(device=out_device, dtype=out_dtype)
|
||||
21608
mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/core.c
Normal file
21608
mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/core.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,42 @@
|
||||
cimport cython
|
||||
from cython.parallel import prange
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
    # Dynamic-programming search over the top-left t_y x t_x region of
    # ``value``.
    #
    # Pass 1 overwrites value[y, x] with the best cumulative score of a path
    # ending at (y, x), where each step either stays in column x or moves
    # from column x - 1.  Pass 2 walks back from the last column and writes
    # 1 into ``path`` at the chosen cell of every row.  ``value`` is
    # modified in place.
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef int index = t_x - 1

    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            # Staying in the same column is impossible on the diagonal.
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[y-1, x]
            # Coming from the previous column is impossible in column 0,
            # except for the origin cell, which starts at score 0.
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[y-1, x-1]
            value[y, x] += max(v_prev, v_cur)

    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        # ``y != 0`` guard: with boundscheck and wraparound disabled,
        # value[y-1, ...] at y == 0 would read before the start of the
        # buffer.  Row 0 is the last iteration, so skipping the step there
        # cannot change ``path``.
        if index != 0 and y != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
            index = index - 1
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
    # Run maximum_path_each on every batch item; items are distributed over
    # a prange loop with the GIL released.
    cdef int batch_size = paths.shape[0]
    cdef int item
    for item in prange(batch_size, nogil=True):
        maximum_path_each(paths[item], values[item], t_ys[item], t_xs[item])
|
||||
@@ -0,0 +1,13 @@
|
||||
"""Build script for the ``monotonic_align`` Cython extension."""
try:
    # distutils was deprecated by PEP 632 and removed from the standard
    # library in Python 3.12, so setuptools is preferred.  It is imported
    # before Cython so that it is loaded ahead of any distutils import.
    from setuptools import setup
except ImportError:
    # Older environments without setuptools keep the previous behaviour.
    from distutils.core import setup

from pathlib import Path

import numpy
from Cython.Build import cythonize

# Directory containing this script; core.pyx lives next to it.
_DIR = Path(__file__).parent

setup(
    name="monotonic_align",
    ext_modules=cythonize(str(_DIR / "core.pyx")),
    include_dirs=[numpy.get_include()],
)
|
||||
212
mlu_370-piper/piper/src/python/piper_train/vits/transforms.py
Normal file
212
mlu_370-piper/piper/src/python/piper_train/vits/transforms.py
Normal file
@@ -0,0 +1,212 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
DEFAULT_MIN_BIN_WIDTH = 1e-3
|
||||
DEFAULT_MIN_BIN_HEIGHT = 1e-3
|
||||
DEFAULT_MIN_DERIVATIVE = 1e-3
|
||||
|
||||
|
||||
def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Apply a rational-quadratic spline transform, with or without tails.

    With ``tails=None`` the call is forwarded to
    ``rational_quadratic_spline``; otherwise it is forwarded to
    ``unconstrained_rational_quadratic_spline`` together with ``tails`` and
    ``tail_bound``. All other arguments are passed through unchanged.

    Returns:
        The ``(outputs, logabsdet)`` pair produced by the selected spline
        function.
    """
    # Arguments common to both spline implementations.
    shared = {
        "inputs": inputs,
        "unnormalized_widths": unnormalized_widths,
        "unnormalized_heights": unnormalized_heights,
        "unnormalized_derivatives": unnormalized_derivatives,
        "inverse": inverse,
        "min_bin_width": min_bin_width,
        "min_bin_height": min_bin_height,
        "min_derivative": min_derivative,
    }

    if tails is None:
        outputs, logabsdet = rational_quadratic_spline(**shared)
    else:
        outputs, logabsdet = unconstrained_rational_quadratic_spline(
            tails=tails, tail_bound=tail_bound, **shared
        )
    return outputs, logabsdet
|
||||
|
||||
|
||||
def searchsorted(bin_locations, inputs, eps=1e-6):
    """Return, for each input, the index of the bin edge interval containing it.

    The result is ``(number of edges <= input) - 1`` along the last axis.

    Note:
        ``bin_locations`` is modified in place: ``eps`` is added to its last
        edge so that an input equal to the upper bound still falls into the
        final bin.
    """
    # Explicit last index rather than -1 (the file deliberately avoids the
    # negative-index form; presumably for export/backend compatibility).
    last_edge = bin_locations.size(-1) - 1
    bin_locations[..., last_edge] += eps

    at_or_above_edge = inputs.unsqueeze(-1) >= bin_locations
    return at_or_above_edge.sum(dim=-1) - 1
|
||||
|
||||
|
||||
def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Rational-quadratic spline on ``[-tail_bound, tail_bound]`` with identity tails.

    Inputs inside the interval are transformed by
    ``rational_quadratic_spline``; inputs outside are passed through unchanged
    with a log-determinant of zero.

    Returns:
        ``(outputs, logabsdet)``, both shaped like ``inputs``.

    Raises:
        RuntimeError: if ``tails`` is anything other than ``"linear"``.
    """
    in_domain = (inputs >= -tail_bound) & (inputs <= tail_bound)
    out_of_domain = ~in_domain

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails != "linear":
        raise RuntimeError("{} tails are not implemented.".format(tails))

    # Add one derivative slot at each end and pin both to a value whose
    # softplus is 1 - min_derivative, so the boundary derivative computed in
    # rational_quadratic_spline (min_derivative + softplus) is exactly 1.
    unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
    boundary_value = np.log(np.exp(1 - min_derivative) - 1)
    last_slot = unnormalized_derivatives.size(-1) - 1
    unnormalized_derivatives[..., 0] = boundary_value
    unnormalized_derivatives[..., last_slot] = boundary_value

    # Identity outside the interval.
    outputs[out_of_domain] = inputs[out_of_domain]
    logabsdet[out_of_domain] = 0

    spline_outputs, spline_logabsdet = rational_quadratic_spline(
        inputs=inputs[in_domain],
        unnormalized_widths=unnormalized_widths[in_domain, :],
        unnormalized_heights=unnormalized_heights[in_domain, :],
        unnormalized_derivatives=unnormalized_derivatives[in_domain, :],
        inverse=inverse,
        left=-tail_bound,
        right=tail_bound,
        bottom=-tail_bound,
        top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )
    outputs[in_domain] = spline_outputs
    logabsdet[in_domain] = spline_logabsdet

    return outputs, logabsdet
|
||||
|
||||
|
||||
def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Evaluate a monotonic rational-quadratic spline (or its inverse).

    Bin widths and heights come from a softmax over the last axis of
    ``unnormalized_widths`` / ``unnormalized_heights``, floored at
    ``min_bin_width`` / ``min_bin_height`` and scaled to ``[left, right]`` and
    ``[bottom, top]``. Knot derivatives are
    ``min_derivative + softplus(unnormalized_derivatives)``.

    Args:
        inputs: values to transform; no domain check is performed (the check
            is commented out below).
        unnormalized_widths: last axis has one entry per bin.
        unnormalized_heights: last axis has one entry per bin.
        unnormalized_derivatives: last axis is indexed at ``bin_idx`` and
            ``bin_idx + 1``, so it presumably has ``num_bins + 1`` entries —
            TODO confirm against callers.
        inverse: when true, map from the output range back to the input range.

    Returns:
        ``(outputs, logabsdet)``; in the inverse direction the log-determinant
        is negated before being returned.
    """
    # if torch.min(inputs) < left or torch.max(inputs) > right:
    #     raise ValueError("Input to a transform is not within its domain")

    num_bins = unnormalized_widths.shape[-1]

    # if min_bin_width * num_bins > 1.0:
    #     raise ValueError("Minimal bin width too large for the number of bins")
    # if min_bin_height * num_bins > 1.0:
    #     raise ValueError("Minimal bin height too large for the number of bins")

    # Bin widths: softmax, floor at min_bin_width, then cumulative edges.
    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
    cumwidths = (right - left) * cumwidths + left
    # Pin the outer edges exactly to the interval bounds.
    cumwidths[..., 0] = left
    # cumwidths[..., -1] = right
    cumwidths[..., cumwidths.size(-1) - 1] = right
    # Recompute widths from the pinned edges so both stay consistent.
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    derivatives = min_derivative + F.softplus(unnormalized_derivatives)

    # Bin heights: same construction as the widths, on [bottom, top].
    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    # cumheights[..., -1] = top
    cumheights[..., cumheights.size(-1) - 1] = top
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    # Locate each input's bin: by height edges when inverting, by width edges
    # otherwise. searchsorted adds a small eps to the last edge IN PLACE, so
    # the edge tensor used here is slightly modified afterwards.
    if inverse:
        bin_idx = searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = searchsorted(cumwidths, inputs)[..., None]

    # Gather the per-input parameters of the selected bin.
    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]

    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
    delta = heights / widths  # average slope of each bin
    input_delta = delta.gather(-1, bin_idx)[..., 0]

    # Derivatives at the left and right knot of the selected bin.
    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]

    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        # Coefficients of the quadratic a*t^2 + b*t + c = 0 in the normalized
        # bin position t.
        a = (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        ) + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        )
        c = -input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all(), discriminant

        # Root written as 2c / (-b - sqrt(disc)) rather than the textbook
        # (-b + sqrt(disc)) / 2a form.
        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - root).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        # The inverse map's log-determinant is the negative of the forward one.
        return outputs, -logabsdet

    # Forward direction: theta is the input's relative position inside its bin.
    theta = (inputs - input_cumwidths) / input_bin_widths
    theta_one_minus_theta = theta * (1 - theta)

    numerator = input_heights * (
        input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
    )
    denominator = input_delta + (
        (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
        * theta_one_minus_theta
    )
    outputs = input_cumheights + numerator / denominator

    derivative_numerator = input_delta.pow(2) * (
        input_derivatives_plus_one * theta.pow(2)
        + 2 * input_delta * theta_one_minus_theta
        + input_derivatives * (1 - theta).pow(2)
    )
    logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

    return outputs, logabsdet
|
||||
16
mlu_370-piper/piper/src/python/piper_train/vits/utils.py
Normal file
16
mlu_370-piper/piper/src/python/piper_train/vits/utils.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def to_gpu(x: torch.Tensor) -> torch.Tensor:
    """Make ``x`` contiguous and move it to the CUDA device.

    The transfer is requested with ``non_blocking=True``.
    """
    contiguous = x.contiguous()
    return contiguous.cuda(non_blocking=True)
|
||||
|
||||
|
||||
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so that its peak absolute value maps to
    ``max_wav_value``. The peak is floored at 0.01, so near-silent input is
    not amplified without bound.

    Args:
        audio: floating-point samples.
        max_wav_value: target peak magnitude after scaling.

    Returns:
        Array of dtype ``int16`` with the same shape as ``audio``. Empty input
        yields an empty ``int16`` array.
    """
    # np.max raises ValueError on a zero-size array; there is nothing to
    # normalize in that case, so just convert the dtype.
    if np.size(audio) == 0:
        return np.asarray(audio).astype("int16")

    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    # Guard against rounding pushing a sample past the int16 range.
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm
|
||||
860
mlu_370-piper/piper/src/python/piper_train/vits/wavfile.py
Normal file
860
mlu_370-piper/piper/src/python/piper_train/vits/wavfile.py
Normal file
@@ -0,0 +1,860 @@
|
||||
"""
|
||||
Module to read / write wav files using NumPy arrays
|
||||
|
||||
Functions
|
||||
---------
|
||||
`read`: Return the sample rate (in samples/sec) and data from a WAV file.
|
||||
|
||||
`write`: Write a NumPy array as a WAV file.
|
||||
|
||||
"""
|
||||
import io
|
||||
import struct
|
||||
import sys
|
||||
import warnings
|
||||
from enum import IntEnum
|
||||
|
||||
import numpy
|
||||
|
||||
__all__ = ["WavFileWarning", "read", "write"]
|
||||
|
||||
|
||||
class WavFileWarning(UserWarning):
    """Warning category this module uses for recoverable WAV file problems."""
|
||||
|
||||
|
||||
class WAVE_FORMAT(IntEnum):
    """
    WAVE form wFormatTag IDs

    Complete list is in mmreg.h in Windows 10 SDK.  ALAC and OPUS are the
    newest additions, in v10.0.14393 2016-07

    Only PCM and IEEE_FLOAT are listed in KNOWN_WAVE_FORMATS in this module;
    EXTENSIBLE is handled specially when the fmt chunk is parsed. The other
    members exist so that an unsupported tag can be reported by name.
    """

    UNKNOWN = 0x0000
    PCM = 0x0001
    ADPCM = 0x0002
    IEEE_FLOAT = 0x0003
    VSELP = 0x0004
    IBM_CVSD = 0x0005
    ALAW = 0x0006
    MULAW = 0x0007
    DTS = 0x0008
    DRM = 0x0009
    WMAVOICE9 = 0x000A
    WMAVOICE10 = 0x000B
    OKI_ADPCM = 0x0010
    DVI_ADPCM = 0x0011
    IMA_ADPCM = 0x0011  # Duplicate value: this is an alias of DVI_ADPCM
    MEDIASPACE_ADPCM = 0x0012
    SIERRA_ADPCM = 0x0013
    G723_ADPCM = 0x0014
    DIGISTD = 0x0015
    DIGIFIX = 0x0016
    DIALOGIC_OKI_ADPCM = 0x0017
    MEDIAVISION_ADPCM = 0x0018
    CU_CODEC = 0x0019
    HP_DYN_VOICE = 0x001A
    YAMAHA_ADPCM = 0x0020
    SONARC = 0x0021
    DSPGROUP_TRUESPEECH = 0x0022
    ECHOSC1 = 0x0023
    AUDIOFILE_AF36 = 0x0024
    APTX = 0x0025
    AUDIOFILE_AF10 = 0x0026
    PROSODY_1612 = 0x0027
    LRC = 0x0028
    DOLBY_AC2 = 0x0030
    GSM610 = 0x0031
    MSNAUDIO = 0x0032
    ANTEX_ADPCME = 0x0033
    CONTROL_RES_VQLPC = 0x0034
    DIGIREAL = 0x0035
    DIGIADPCM = 0x0036
    CONTROL_RES_CR10 = 0x0037
    NMS_VBXADPCM = 0x0038
    CS_IMAADPCM = 0x0039
    ECHOSC3 = 0x003A
    ROCKWELL_ADPCM = 0x003B
    ROCKWELL_DIGITALK = 0x003C
    XEBEC = 0x003D
    G721_ADPCM = 0x0040
    G728_CELP = 0x0041
    MSG723 = 0x0042
    INTEL_G723_1 = 0x0043
    INTEL_G729 = 0x0044
    SHARP_G726 = 0x0045
    MPEG = 0x0050
    RT24 = 0x0052
    PAC = 0x0053
    MPEGLAYER3 = 0x0055
    LUCENT_G723 = 0x0059
    CIRRUS = 0x0060
    ESPCM = 0x0061
    VOXWARE = 0x0062
    CANOPUS_ATRAC = 0x0063
    G726_ADPCM = 0x0064
    G722_ADPCM = 0x0065
    DSAT = 0x0066
    DSAT_DISPLAY = 0x0067
    VOXWARE_BYTE_ALIGNED = 0x0069
    VOXWARE_AC8 = 0x0070
    VOXWARE_AC10 = 0x0071
    VOXWARE_AC16 = 0x0072
    VOXWARE_AC20 = 0x0073
    VOXWARE_RT24 = 0x0074
    VOXWARE_RT29 = 0x0075
    VOXWARE_RT29HW = 0x0076
    VOXWARE_VR12 = 0x0077
    VOXWARE_VR18 = 0x0078
    VOXWARE_TQ40 = 0x0079
    VOXWARE_SC3 = 0x007A
    VOXWARE_SC3_1 = 0x007B
    SOFTSOUND = 0x0080
    VOXWARE_TQ60 = 0x0081
    MSRT24 = 0x0082
    G729A = 0x0083
    MVI_MVI2 = 0x0084
    DF_G726 = 0x0085
    DF_GSM610 = 0x0086
    ISIAUDIO = 0x0088
    ONLIVE = 0x0089
    MULTITUDE_FT_SX20 = 0x008A
    INFOCOM_ITS_G721_ADPCM = 0x008B
    CONVEDIA_G729 = 0x008C
    CONGRUENCY = 0x008D
    SBC24 = 0x0091
    DOLBY_AC3_SPDIF = 0x0092
    MEDIASONIC_G723 = 0x0093
    PROSODY_8KBPS = 0x0094
    ZYXEL_ADPCM = 0x0097
    PHILIPS_LPCBB = 0x0098
    PACKED = 0x0099
    MALDEN_PHONYTALK = 0x00A0
    RACAL_RECORDER_GSM = 0x00A1
    RACAL_RECORDER_G720_A = 0x00A2
    RACAL_RECORDER_G723_1 = 0x00A3
    RACAL_RECORDER_TETRA_ACELP = 0x00A4
    NEC_AAC = 0x00B0
    RAW_AAC1 = 0x00FF
    RHETOREX_ADPCM = 0x0100
    IRAT = 0x0101
    VIVO_G723 = 0x0111
    VIVO_SIREN = 0x0112
    PHILIPS_CELP = 0x0120
    PHILIPS_GRUNDIG = 0x0121
    DIGITAL_G723 = 0x0123
    SANYO_LD_ADPCM = 0x0125
    SIPROLAB_ACEPLNET = 0x0130
    SIPROLAB_ACELP4800 = 0x0131
    SIPROLAB_ACELP8V3 = 0x0132
    SIPROLAB_G729 = 0x0133
    SIPROLAB_G729A = 0x0134
    SIPROLAB_KELVIN = 0x0135
    VOICEAGE_AMR = 0x0136
    G726ADPCM = 0x0140
    DICTAPHONE_CELP68 = 0x0141
    DICTAPHONE_CELP54 = 0x0142
    QUALCOMM_PUREVOICE = 0x0150
    QUALCOMM_HALFRATE = 0x0151
    TUBGSM = 0x0155
    MSAUDIO1 = 0x0160
    WMAUDIO2 = 0x0161
    WMAUDIO3 = 0x0162
    WMAUDIO_LOSSLESS = 0x0163
    WMASPDIF = 0x0164
    UNISYS_NAP_ADPCM = 0x0170
    UNISYS_NAP_ULAW = 0x0171
    UNISYS_NAP_ALAW = 0x0172
    UNISYS_NAP_16K = 0x0173
    SYCOM_ACM_SYC008 = 0x0174
    SYCOM_ACM_SYC701_G726L = 0x0175
    SYCOM_ACM_SYC701_CELP54 = 0x0176
    SYCOM_ACM_SYC701_CELP68 = 0x0177
    KNOWLEDGE_ADVENTURE_ADPCM = 0x0178
    FRAUNHOFER_IIS_MPEG2_AAC = 0x0180
    DTS_DS = 0x0190
    CREATIVE_ADPCM = 0x0200
    CREATIVE_FASTSPEECH8 = 0x0202
    CREATIVE_FASTSPEECH10 = 0x0203
    UHER_ADPCM = 0x0210
    ULEAD_DV_AUDIO = 0x0215
    ULEAD_DV_AUDIO_1 = 0x0216
    QUARTERDECK = 0x0220
    ILINK_VC = 0x0230
    RAW_SPORT = 0x0240
    ESST_AC3 = 0x0241
    GENERIC_PASSTHRU = 0x0249
    IPI_HSX = 0x0250
    IPI_RPELP = 0x0251
    CS2 = 0x0260
    SONY_SCX = 0x0270
    SONY_SCY = 0x0271
    SONY_ATRAC3 = 0x0272
    SONY_SPC = 0x0273
    TELUM_AUDIO = 0x0280
    TELUM_IA_AUDIO = 0x0281
    NORCOM_VOICE_SYSTEMS_ADPCM = 0x0285
    FM_TOWNS_SND = 0x0300
    MICRONAS = 0x0350
    MICRONAS_CELP833 = 0x0351
    BTV_DIGITAL = 0x0400
    INTEL_MUSIC_CODER = 0x0401
    INDEO_AUDIO = 0x0402
    QDESIGN_MUSIC = 0x0450
    ON2_VP7_AUDIO = 0x0500
    ON2_VP6_AUDIO = 0x0501
    VME_VMPCM = 0x0680
    TPC = 0x0681
    LIGHTWAVE_LOSSLESS = 0x08AE
    OLIGSM = 0x1000
    OLIADPCM = 0x1001
    OLICELP = 0x1002
    OLISBC = 0x1003
    OLIOPR = 0x1004
    LH_CODEC = 0x1100
    LH_CODEC_CELP = 0x1101
    LH_CODEC_SBC8 = 0x1102
    LH_CODEC_SBC12 = 0x1103
    LH_CODEC_SBC16 = 0x1104
    NORRIS = 0x1400
    ISIAUDIO_2 = 0x1401
    SOUNDSPACE_MUSICOMPRESS = 0x1500
    MPEG_ADTS_AAC = 0x1600
    MPEG_RAW_AAC = 0x1601
    MPEG_LOAS = 0x1602
    NOKIA_MPEG_ADTS_AAC = 0x1608
    NOKIA_MPEG_RAW_AAC = 0x1609
    VODAFONE_MPEG_ADTS_AAC = 0x160A
    VODAFONE_MPEG_RAW_AAC = 0x160B
    MPEG_HEAAC = 0x1610
    VOXWARE_RT24_SPEECH = 0x181C
    SONICFOUNDRY_LOSSLESS = 0x1971
    INNINGS_TELECOM_ADPCM = 0x1979
    LUCENT_SX8300P = 0x1C07
    LUCENT_SX5363S = 0x1C0C
    CUSEEME = 0x1F03
    NTCSOFT_ALF2CM_ACM = 0x1FC4
    DVM = 0x2000
    DTS2 = 0x2001
    MAKEAVIS = 0x3313
    DIVIO_MPEG4_AAC = 0x4143
    NOKIA_ADAPTIVE_MULTIRATE = 0x4201
    DIVIO_G726 = 0x4243
    LEAD_SPEECH = 0x434C
    LEAD_VORBIS = 0x564C
    WAVPACK_AUDIO = 0x5756
    OGG_VORBIS_MODE_1 = 0x674F
    OGG_VORBIS_MODE_2 = 0x6750
    OGG_VORBIS_MODE_3 = 0x6751
    OGG_VORBIS_MODE_1_PLUS = 0x676F
    OGG_VORBIS_MODE_2_PLUS = 0x6770
    OGG_VORBIS_MODE_3_PLUS = 0x6771
    ALAC = 0x6C61
    _3COM_NBX = 0x7000  # Can't have leading digit
    OPUS = 0x704F
    FAAD_AAC = 0x706D
    AMR_NB = 0x7361
    AMR_WB = 0x7362
    AMR_WP = 0x7363
    GSM_AMR_CBR = 0x7A21
    GSM_AMR_VBR_SID = 0x7A22
    COMVERSE_INFOSYS_G723_1 = 0xA100
    COMVERSE_INFOSYS_AVQSBC = 0xA101
    COMVERSE_INFOSYS_SBC = 0xA102
    SYMBOL_G729_A = 0xA103
    VOICEAGE_AMR_WB = 0xA104
    INGENIENT_G726 = 0xA105
    MPEG4_AAC = 0xA106
    ENCORE_G726 = 0xA107
    ZOLL_ASAO = 0xA108
    SPEEX_VOICE = 0xA109
    VIANIX_MASC = 0xA10A
    WM9_SPECTRUM_ANALYZER = 0xA10B
    WMF_SPECTRUM_ANAYZER = 0xA10C
    GSM_610 = 0xA10D
    GSM_620 = 0xA10E
    GSM_660 = 0xA10F
    GSM_690 = 0xA110
    GSM_ADAPTIVE_MULTIRATE_WB = 0xA111
    POLYCOM_G722 = 0xA112
    POLYCOM_G728 = 0xA113
    POLYCOM_G729_A = 0xA114
    POLYCOM_SIREN = 0xA115
    GLOBAL_IP_ILBC = 0xA116
    RADIOTIME_TIME_SHIFT_RADIO = 0xA117
    NICE_ACA = 0xA118
    NICE_ADPCM = 0xA119
    VOCORD_G721 = 0xA11A
    VOCORD_G726 = 0xA11B
    VOCORD_G722_1 = 0xA11C
    VOCORD_G728 = 0xA11D
    VOCORD_G729 = 0xA11E
    VOCORD_G729_A = 0xA11F
    VOCORD_G723_1 = 0xA120
    VOCORD_LBC = 0xA121
    NICE_G728 = 0xA122
    FRACE_TELECOM_G729 = 0xA123
    CODIAN = 0xA124
    FLAC = 0xF1AC
    EXTENSIBLE = 0xFFFE
    DEVELOPMENT = 0xFFFF
|
||||
|
||||
|
||||
# Format tags this module can decode; any other tag found in a fmt chunk is
# rejected via _raise_bad_format.
KNOWN_WAVE_FORMATS = {WAVE_FORMAT.PCM, WAVE_FORMAT.IEEE_FLOAT}
|
||||
|
||||
|
||||
def _raise_bad_format(format_tag):
    """Raise ``ValueError`` describing an unsupported wave format tag.

    The tag is reported by its ``WAVE_FORMAT`` name when it has one, and as a
    zero-padded hexadecimal number otherwise. The message also lists the names
    in ``KNOWN_WAVE_FORMATS``.
    """
    try:
        known_member = WAVE_FORMAT(format_tag)
    except ValueError:
        # Not a registered tag: fall back to e.g. "0x1234".
        format_name = f"{format_tag:#06x}"
    else:
        format_name = known_member.name

    supported = ", ".join(x.name for x in KNOWN_WAVE_FORMATS)
    raise ValueError(
        f"Unknown wave file format: {format_name}. Supported "
        "formats: " + supported
    )
|
||||
|
||||
|
||||
def _read_fmt_chunk(fid, is_big_endian):
    """
    Parse the body of a 'fmt ' chunk and leave `fid` at the next chunk.

    Parameters
    ----------
    fid : file-like
        Binary stream positioned right after the 'fmt ' chunk id.
    is_big_endian : bool
        Selects big-endian ('>') instead of little-endian ('<') unpacking.

    Returns
    -------
    size : int
        size of format subchunk in bytes (minus 8 for "fmt " and itself)
    format_tag : int
        PCM, float, or compressed format
    channels : int
        number of channels
    fs : int
        sampling frequency in samples per second
    bytes_per_second : int
        overall byte rate for the file
    block_align : int
        bytes per sample, including all channels
    bit_depth : int
        bits per sample

    Raises
    ------
    ValueError
        If the chunk is shorter than 16 bytes, if an EXTENSIBLE chunk's
        extension is shorter than 22 bytes, or if the resulting format tag is
        not in KNOWN_WAVE_FORMATS.

    Notes
    -----
    Assumes file pointer is immediately after the 'fmt ' id
    """
    if is_big_endian:
        fmt = ">"
    else:
        fmt = "<"

    size = struct.unpack(fmt + "I", fid.read(4))[0]

    # The mandatory part of the chunk is 16 bytes long.
    if size < 16:
        raise ValueError("Binary structure of wave file is not compliant")

    res = struct.unpack(fmt + "HHIIHH", fid.read(16))
    bytes_read = 16  # bytes of the chunk body consumed so far

    format_tag, channels, fs, bytes_per_second, block_align, bit_depth = res

    # EXTENSIBLE stores the real format tag inside a GUID in the extension.
    if format_tag == WAVE_FORMAT.EXTENSIBLE and size >= (16 + 2):
        ext_chunk_size = struct.unpack(fmt + "H", fid.read(2))[0]
        bytes_read += 2
        if ext_chunk_size >= 22:
            extensible_chunk_data = fid.read(22)
            bytes_read += 22
            # Skip the first 2 + 4 bytes of the extension; the following
            # 16 bytes are the sub-format GUID.
            raw_guid = extensible_chunk_data[2 + 4 : 2 + 4 + 16]
            # GUID template {XXXXXXXX-0000-0010-8000-00AA00389B71} (RFC-2361)
            # MS GUID byte order: first three groups are native byte order,
            # rest is Big Endian
            if is_big_endian:
                tail = b"\x00\x00\x00\x10\x80\x00\x00\xAA\x00\x38\x9B\x71"
            else:
                tail = b"\x00\x00\x10\x00\x80\x00\x00\xAA\x00\x38\x9B\x71"
            # Only GUIDs matching the template carry a plain format tag in
            # their first four bytes; otherwise format_tag stays EXTENSIBLE
            # and is rejected below.
            if raw_guid.endswith(tail):
                format_tag = struct.unpack(fmt + "I", raw_guid[:4])[0]
        else:
            raise ValueError("Binary structure of wave file is not compliant")

    if format_tag not in KNOWN_WAVE_FORMATS:
        _raise_bad_format(format_tag)

    # move file pointer to next chunk
    if size > bytes_read:
        fid.read(size - bytes_read)

    # fmt should always be 16, 18 or 40, but handle it just in case
    _handle_pad_byte(fid, size)

    return (size, format_tag, channels, fs, bytes_per_second, block_align, bit_depth)
|
||||
|
||||
|
||||
def _read_data_chunk(
    fid, format_tag, channels, bit_depth, is_big_endian, block_align, mmap=False
):
    """
    Read the body of a 'data' chunk into a NumPy array.

    Parameters
    ----------
    fid : file-like
        Binary stream positioned right after the 'data' chunk id.
    format_tag : int
        WAVE_FORMAT.PCM or WAVE_FORMAT.IEEE_FLOAT; anything else is rejected.
    channels : int
        Number of channels; the result is reshaped to (-1, channels) when
        this is greater than 1.
    bit_depth : int
        Bits per sample as reported by the fmt chunk.
    is_big_endian : bool
        Selects big-endian instead of little-endian sample dtypes.
    block_align : int
        Bytes per frame (all channels); used to derive the container size.
    mmap : bool, optional
        Memory-map the samples instead of reading them. Only container sizes
        of 1, 2, 4 or 8 bytes are accepted in this mode.

    Returns
    -------
    data : numpy array
        The decoded samples.

    Raises
    ------
    ValueError
        For unsupported bit depths, unsupported format tags, or a container
        size that cannot be memory-mapped.

    Notes
    -----
    Assumes file pointer is immediately after the 'data' id

    It's possible to not use all available bits in a container, or to store
    samples in a container bigger than necessary, so bytes_per_sample uses
    the actual reported container size (nBlockAlign / nChannels).  Real-world
    examples:

    Adobe Audition's "24-bit packed int (type 1, 20-bit)"

        nChannels = 2, nBlockAlign = 6, wBitsPerSample = 20

    http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples/AFsp/M1F1-int12-AFsp.wav
    is:

        nChannels = 2, nBlockAlign = 4, wBitsPerSample = 12

    http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Docs/multichaudP.pdf
    gives an example of:

        nChannels = 2, nBlockAlign = 8, wBitsPerSample = 20
    """
    if is_big_endian:
        fmt = ">"
    else:
        fmt = "<"

    # Size of the data subchunk in bytes
    size = struct.unpack(fmt + "I", fid.read(4))[0]

    # Number of bytes per sample (sample container size)
    bytes_per_sample = block_align // channels
    n_samples = size // bytes_per_sample

    # Choose the NumPy dtype used to read the raw bytes.
    if format_tag == WAVE_FORMAT.PCM:
        if 1 <= bit_depth <= 8:
            dtype = "u1"  # WAV of 8-bit integer or less are unsigned
        elif bytes_per_sample in {3, 5, 6, 7}:
            # No compatible dtype. Load as raw bytes for reshaping later.
            dtype = "V1"
        elif bit_depth <= 64:
            # Remaining bit depths can map directly to signed numpy dtypes
            dtype = f"{fmt}i{bytes_per_sample}"
        else:
            raise ValueError(
                "Unsupported bit depth: the WAV file "
                f"has {bit_depth}-bit integer data."
            )
    elif format_tag == WAVE_FORMAT.IEEE_FLOAT:
        if bit_depth in {32, 64}:
            dtype = f"{fmt}f{bytes_per_sample}"
        else:
            raise ValueError(
                "Unsupported bit depth: the WAV file "
                f"has {bit_depth}-bit floating-point data."
            )
    else:
        _raise_bad_format(format_tag)

    start = fid.tell()
    if not mmap:
        try:
            # Raw-byte mode reads one element per byte, so the count is the
            # chunk size rather than the sample count.
            count = size if dtype == "V1" else n_samples
            data = numpy.fromfile(fid, dtype=dtype, count=count)
        except io.UnsupportedOperation:  # not a C-like file
            fid.seek(start, 0)  # just in case it seeked, though it shouldn't
            data = numpy.frombuffer(fid.read(size), dtype=dtype)

        if dtype == "V1":
            # Rearrange raw bytes into smallest compatible numpy dtype
            dt = f"{fmt}i4" if bytes_per_sample == 3 else f"{fmt}i8"
            a = numpy.zeros(
                (len(data) // bytes_per_sample, numpy.dtype(dt).itemsize), dtype="V1"
            )
            # Place the sample bytes at the most-significant end of the wider
            # container (first bytes for big-endian, last bytes for
            # little-endian); the remaining bytes stay zero.
            if is_big_endian:
                a[:, :bytes_per_sample] = data.reshape((-1, bytes_per_sample))
            else:
                a[:, -bytes_per_sample:] = data.reshape((-1, bytes_per_sample))
            data = a.view(dt).reshape(a.shape[:-1])
    else:
        if bytes_per_sample in {1, 2, 4, 8}:
            start = fid.tell()
            data = numpy.memmap(
                fid, dtype=dtype, mode="c", offset=start, shape=(n_samples,)
            )
            # memmap does not advance the stream; move past the chunk body.
            fid.seek(start + size)
        else:
            raise ValueError(
                "mmap=True not compatible with "
                f"{bytes_per_sample}-byte container size."
            )

    _handle_pad_byte(fid, size)

    if channels > 1:
        data = data.reshape(-1, channels)
    return data
|
||||
|
||||
|
||||
def _skip_unknown_chunk(fid, is_big_endian):
    """Skip over a chunk whose body is not interpreted.

    Expects ``fid`` to be positioned right after the chunk id. Reads the
    32-bit chunk size, seeks past the body and past the pad byte if the size
    is odd. Does nothing when the size field cannot be read at all.
    """
    size_field = fid.read(4)
    # An empty read means end of file: unpacking would raise for no benefit,
    # and there is nothing left to seek over.
    if not size_field:
        return

    (size,) = struct.unpack(">I" if is_big_endian else "<I", size_field)
    fid.seek(size, 1)
    _handle_pad_byte(fid, size)
|
||||
|
||||
|
||||
def _read_riff_chunk(fid):
    """Read the 12-byte RIFF/RIFX header.

    Returns:
        ``(file_size, is_big_endian)`` where ``file_size`` is the size stored
        in the header plus 8, and ``is_big_endian`` is true for ``RIFX``.

    Raises:
        ValueError: if the signature is neither ``RIFF`` nor ``RIFX``, or the
            form type is not ``WAVE``.
    """
    signature = fid.read(4)  # File signature

    # (signature, is_big_endian, struct format of the 32-bit size field)
    for magic, is_big_endian, size_fmt in (
        (b"RIFF", False, "<I"),
        (b"RIFX", True, ">I"),
    ):
        if signature == magic:
            break
    else:
        # There are also .wav files with "FFIR" or "XFIR" signatures?
        raise ValueError(
            f"File format {repr(signature)} not understood. Only "
            "'RIFF' and 'RIFX' supported."
        )

    # Size of entire file
    (riff_size,) = struct.unpack(size_fmt, fid.read(4))
    file_size = riff_size + 8

    form_type = fid.read(4)
    if form_type != b"WAVE":
        raise ValueError(f"Not a WAV file. RIFF form type is {repr(form_type)}.")

    return file_size, is_big_endian
|
||||
|
||||
|
||||
def _handle_pad_byte(fid, size):
    """Advance ``fid`` by one byte when ``size`` is odd.

    "If the chunk size is an odd number of bytes, a pad byte with value zero
    is written after ckData." So the pad has to be skipped after each chunk.
    """
    if size % 2 == 0:
        return
    fid.seek(1, io.SEEK_CUR)
|
||||
|
||||
|
||||
def read(filename, mmap=False):
    """
    Open a WAV file.

    Return the sample rate (in samples/sec) and data from an LPCM WAV file.

    Parameters
    ----------
    filename : string or open file handle
        Input WAV file.
    mmap : bool, optional
        Whether to read data as memory-mapped (default: False). Not compatible
        with some bit depths; see Notes. Only to be used on real files.
        Ignored (forced to False) when `filename` is an open file handle.

        .. versionadded:: 0.12.0

    Returns
    -------
    rate : int
        Sample rate of WAV file.
    data : numpy array
        Data read from WAV file. Data-type is determined from the file;
        see Notes. Data is 1-D for 1-channel WAV, or 2-D of shape
        (Nsamples, Nchannels) otherwise. If a file-like input without a
        C-like file descriptor (e.g., :class:`python:io.BytesIO`) is
        passed, this will not be writeable.

    Raises
    ------
    ValueError
        If the file ends before any data chunk was read, if a chunk id is
        incomplete before both fmt and data chunks were read, or if a data
        chunk appears before the fmt chunk.

    Warns
    -----
    WavFileWarning
        On premature end of file after the data was read, on an incomplete
        trailing chunk id, and on chunks that are not understood.

    Notes
    -----
    Common data types: [1]_

    =====================  ===========  ===========  =============
         WAV format            Min          Max       NumPy dtype
    =====================  ===========  ===========  =============
    32-bit floating-point  -1.0         +1.0         float32
    32-bit integer PCM     -2147483648  +2147483647  int32
    24-bit integer PCM     -2147483648  +2147483392  int32
    16-bit integer PCM     -32768       +32767       int16
    8-bit integer PCM      0            255          uint8
    =====================  ===========  ===========  =============

    WAV files can specify arbitrary bit depth, and this function supports
    reading any integer PCM depth from 1 to 64 bits. Data is returned in the
    smallest compatible numpy int type, in left-justified format. 8-bit and
    lower is unsigned, while 9-bit and higher is signed.

    For example, 24-bit data will be stored as int32, with the MSB of the
    24-bit data stored at the MSB of the int32, and typically the least
    significant byte is 0x00. (However, if a file actually contains data past
    its specified bit depth, those bits will be read and output, too. [2]_)

    This bit justification and sign matches WAV's native internal format, which
    allows memory mapping of WAV files that use 1, 2, 4, or 8 bytes per sample
    (so 24-bit files cannot be memory-mapped, but 32-bit can).

    IEEE float PCM in 32- or 64-bit format is supported, with or without mmap.
    Values exceeding [-1, +1] are not clipped.

    Non-linear PCM (mu-law, A-law) is not supported.

    When an open file handle is passed, it is not closed; it is rewound to
    position 0 before returning.

    References
    ----------
    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
       Interface and Data Specifications 1.0", section "Data Format of the
       Samples", August 1991
       http://www.tactilemedia.com/info/MCI_Control_Info.html
    .. [2] Adobe Systems Incorporated, "Adobe Audition 3 User Guide", section
       "Audio file formats: 24-bit Packed Int (type 1, 20-bit)", 2007

    Examples
    --------
    >>> from os.path import dirname, join as pjoin
    >>> from scipy.io import wavfile
    >>> import scipy.io

    Get the filename for an example .wav file from the tests/data directory.

    >>> data_dir = pjoin(dirname(scipy.io.__file__), 'tests', 'data')
    >>> wav_fname = pjoin(data_dir, 'test-44100Hz-2ch-32bit-float-be.wav')

    Load the .wav file contents.

    >>> samplerate, data = wavfile.read(wav_fname)
    >>> print(f"number of channels = {data.shape[1]}")
    number of channels = 2
    >>> length = data.shape[0] / samplerate
    >>> print(f"length = {length}s")
    length = 0.01s

    Plot the waveform.

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> time = np.linspace(0., length, data.shape[0])
    >>> plt.plot(time, data[:, 0], label="Left channel")
    >>> plt.plot(time, data[:, 1], label="Right channel")
    >>> plt.legend()
    >>> plt.xlabel("Time [s]")
    >>> plt.ylabel("Amplitude")
    >>> plt.show()

    """
    if hasattr(filename, "read"):
        fid = filename
        # Memory mapping is only attempted for paths opened here.
        mmap = False
    else:
        # pylint: disable=consider-using-with
        fid = open(filename, "rb")

    try:
        file_size, is_big_endian = _read_riff_chunk(fid)
        fmt_chunk_received = False
        data_chunk_received = False
        while fid.tell() < file_size:
            # read the next chunk
            chunk_id = fid.read(4)

            if not chunk_id:
                if data_chunk_received:
                    # End of file but data successfully read
                    # Both fragments must be f-strings so that file_size is
                    # interpolated instead of printed as a literal placeholder.
                    warnings.warn(
                        f"Reached EOF prematurely; finished at {fid.tell()} bytes, "
                        f"expected {file_size} bytes from header.",
                        WavFileWarning,
                        stacklevel=2,
                    )
                    break

                raise ValueError("Unexpected end of file.")
            if len(chunk_id) < 4:
                msg = f"Incomplete chunk ID: {repr(chunk_id)}"
                # If we have the data, ignore the broken chunk
                if fmt_chunk_received and data_chunk_received:
                    warnings.warn(msg + ", ignoring it.", WavFileWarning, stacklevel=2)
                else:
                    raise ValueError(msg)

            if chunk_id == b"fmt ":
                fmt_chunk_received = True
                fmt_chunk = _read_fmt_chunk(fid, is_big_endian)
                format_tag, channels, fs = fmt_chunk[1:4]
                bit_depth = fmt_chunk[6]
                block_align = fmt_chunk[5]
            elif chunk_id == b"fact":
                _skip_unknown_chunk(fid, is_big_endian)
            elif chunk_id == b"data":
                data_chunk_received = True
                if not fmt_chunk_received:
                    raise ValueError("No fmt chunk before data")
                data = _read_data_chunk(
                    fid,
                    format_tag,
                    channels,
                    bit_depth,
                    is_big_endian,
                    block_align,
                    mmap,
                )
            elif chunk_id == b"LIST":
                # Someday this could be handled properly but for now skip it
                _skip_unknown_chunk(fid, is_big_endian)
            elif chunk_id in {b"JUNK", b"Fake"}:
                # Skip alignment chunks without warning
                _skip_unknown_chunk(fid, is_big_endian)
            else:
                warnings.warn(
                    "Chunk (non-data) not understood, skipping it.",
                    WavFileWarning,
                    stacklevel=2,
                )
                _skip_unknown_chunk(fid, is_big_endian)
    finally:
        if not hasattr(filename, "read"):
            # Only close handles this function opened itself.
            fid.close()
        else:
            # Leave caller-owned handles rewound to the start.
            fid.seek(0)

    return fs, data
|
||||
|
||||
|
||||
def write(filename, rate, data):
    """
    Save a NumPy array to disk (or to an open handle) as a WAV file.

    Parameters
    ----------
    filename : string or open file handle
        Output wav file.
    rate : int
        The sample rate (in samples/sec).
    data : ndarray
        A 1-D or 2-D NumPy array of either integer or float data-type.

    Notes
    -----
    * The file written is simple and uncompressed.
    * For multiple channels, pass a 2-D array of shape
      (Nsamples, Nchannels).
    * Bits-per-sample and the PCM/float format tag are derived from the
      array's data-type.
    * When ``filename`` is a path, the file is opened and closed here. When
      it is a handle (anything with a ``write`` attribute), it is left open
      and rewound to position 0 on exit.

    Common data types: [1]_

    =====================  ===========  ===========  =============
         WAV format            Min          Max       NumPy dtype
    =====================  ===========  ===========  =============
    32-bit floating-point  -1.0         +1.0         float32
    32-bit PCM             -2147483648  +2147483647  int32
    16-bit PCM             -32768       +32767       int16
    8-bit PCM              0            255          uint8
    =====================  ===========  ===========  =============

    Note that 8-bit PCM is unsigned.

    Raises
    ------
    ValueError
        If the dtype is not a signed integer, a float, or a 1-byte unsigned
        integer, or if the resulting file would exceed the 32-bit size field.

    References
    ----------
    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
       Interface and Data Specifications 1.0", section "Data Format of the
       Samples", August 1991
       http://www.tactilemedia.com/info/MCI_Control_Info.html

    Examples
    --------
    Create a 100Hz sine wave, sampled at 44100Hz.
    Write to 16-bit PCM, Mono.

    >>> from scipy.io.wavfile import write
    >>> samplerate = 44100; fs = 100
    >>> t = np.linspace(0., 1., samplerate)
    >>> amplitude = np.iinfo(np.int16).max
    >>> data = amplitude * np.sin(2. * np.pi * fs * t)
    >>> write("example.wav", samplerate, data.astype(np.int16))

    """
    owns_handle = not hasattr(filename, "write")
    # pylint: disable=consider-using-with
    fid = open(filename, "wb") if owns_handle else filename

    try:
        kind = data.dtype.kind
        supported = kind in ("i", "f") or (kind == "u" and data.dtype.itemsize == 1)
        if not supported:
            raise ValueError(f"Unsupported data type '{data.dtype}'")

        # After the check above, anything that is not float is integer PCM.
        is_pcm = kind in ("i", "u")
        format_tag = WAVE_FORMAT.PCM if is_pcm else WAVE_FORMAT.IEEE_FLOAT

        channels = 1 if data.ndim == 1 else data.shape[1]
        sample_bytes = data.dtype.itemsize
        bit_depth = sample_bytes * 8
        bytes_per_second = rate * sample_bytes * channels
        block_align = channels * sample_bytes

        fmt_body = struct.pack(
            "<HHIIHH",
            format_tag,
            channels,
            rate,
            bytes_per_second,
            block_align,
            bit_depth,
        )
        if not is_pcm:
            # add cbSize field for non-PCM files
            fmt_body += b"\x00\x00"

        # RIFF size is a placeholder here; it is patched once the total
        # length is known.
        pieces = [
            b"RIFF",
            b"\x00\x00\x00\x00",
            b"WAVE",
            b"fmt ",
            struct.pack("<I", len(fmt_body)),
            fmt_body,
        ]
        if not is_pcm:
            # fact chunk (non-PCM files)
            pieces.append(b"fact")
            pieces.append(struct.pack("<II", 4, data.shape[0]))
        header = b"".join(pieces)

        # check data size (needs to be immediately before the data chunk)
        if (len(header) - 8) + (8 + data.nbytes) > 0xFFFFFFFF:
            raise ValueError("Data exceeds wave file size limit")

        fid.write(header)

        # data chunk
        fid.write(b"data")
        fid.write(struct.pack("<I", data.nbytes))
        stored_big_endian = data.dtype.byteorder == ">" or (
            data.dtype.byteorder == "=" and sys.byteorder == "big"
        )
        if stored_big_endian:
            data = data.byteswap()
        _array_tofile(fid, data)

        # Patch the RIFF size field (offset 4) now that the length is known.
        total = fid.tell()
        fid.seek(4)
        fid.write(struct.pack("<I", total - 8))

    finally:
        if owns_handle:
            fid.close()
        else:
            fid.seek(0)
|
||||
|
||||
|
||||
def _array_tofile(fid, data):
|
||||
# ravel gives a c-contiguous buffer
|
||||
fid.write(data.ravel().view("b").data)
|
||||
111
mlu_370-piper/piper/src/python/piper_train/voice_conversion.py
Executable file
111
mlu_370-piper/piper/src/python/piper_train/voice_conversion.py
Executable file
@@ -0,0 +1,111 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import librosa
|
||||
import torch
|
||||
|
||||
from .vits.lightning import VitsModel
|
||||
from .vits.mel_processing import spectrogram_torch
|
||||
from .vits.wavfile import write as write_wav
|
||||
|
||||
_LOGGER = logging.getLogger("piper_train.voice_converstion")
|
||||
|
||||
|
||||
def main():
    """Convert audio file(s) from one speaker's voice to another's.

    Parses the command line, loads a VITS checkpoint, and for every input
    audio file writes ``<output-dir>/<input stem>.wav`` containing the
    converted audio. Audio is loaded and written at a fixed 22050 Hz.
    A Ctrl-C during conversion stops the loop quietly.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("audio", nargs="+", help="Audio file(s) to convert")
    parser.add_argument("--checkpoint", required=True, help="Path to model checkpoint")
    parser.add_argument(
        "--output-dir",
        help="Directory to write WAV file(s) (default: current directory)",
    )
    parser.add_argument(
        "--from-speaker", required=True, type=int, help="Speaker id number of source"
    )
    parser.add_argument(
        "--to-speaker", required=True, type=int, help="Speaker id number of target"
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to the console"
    )
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    _LOGGER.debug(args)

    # -------------------------------------------------------------------------

    args.checkpoint = Path(args.checkpoint)
    args.output_dir = Path(args.output_dir) if args.output_dir else Path.cwd()
    # Create the output directory itself. Creating only its parent left a new
    # --output-dir missing, so the first WAV write failed.
    args.output_dir.mkdir(parents=True, exist_ok=True)

    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
    model_g = model.model_g

    # Inference only
    model_g.eval()

    with torch.no_grad():
        model_g.dec.remove_weight_norm()

    try:
        for audio_path_str in args.audio:
            audio_path = Path(audio_path_str)
            wav_path = args.output_dir / f"{audio_path.stem}.wav"

            audio, _sample_rate = librosa.load(path=audio_path_str, sr=22050)

            with torch.no_grad():
                # NOTE: audio is already in [-1, 1] coming from librosa
                audio_norm = torch.FloatTensor(audio).unsqueeze(0)
                spec = spectrogram_torch(
                    y=audio_norm,
                    n_fft=1024,
                    sampling_rate=22050,
                    hop_size=256,
                    win_size=1024,
                    center=False,
                ).squeeze(0)

                # Add the batch dimension back for the model call
                specs = spec.unsqueeze(0)
                spec_lengths = torch.LongTensor([specs.shape[2]])
                from_speaker = torch.LongTensor([args.from_speaker])
                to_speaker = torch.LongTensor([args.to_speaker])

                start_time = time.perf_counter()
                audio = (
                    model_g.voice_conversion(
                        specs, spec_lengths, from_speaker, to_speaker
                    )[0][0, 0]
                    .data.cpu()
                    .float()
                    .numpy()
                )
                end_time = time.perf_counter()

                _LOGGER.debug(
                    "Converted audio in %s second(s) (%s, shape=%s)",
                    end_time - start_time,
                    audio_path.stem,
                    list(audio.shape),
                )

                write_wav(str(wav_path), 22050, audio)

                _LOGGER.info("Wrote WAV to %s", wav_path)
    except KeyboardInterrupt:
        # Deliberate: let the user stop a batch without a traceback
        pass
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
7
mlu_370-piper/piper/src/python/requirements.txt
Normal file
7
mlu_370-piper/piper/src/python/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
cython>=0.29.0,<1
|
||||
piper-phonemize~=1.1.0
|
||||
librosa>=0.9.2,<1
|
||||
numpy>=1.19.0
|
||||
onnxruntime>=1.11.0
|
||||
pytorch-lightning
|
||||
# torch>=1.11.0,<2
|
||||
7
mlu_370-piper/piper/src/python/requirements_dev.txt
Normal file
7
mlu_370-piper/piper/src/python/requirements_dev.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
black==22.3.0
|
||||
coverage==5.0.4
|
||||
flake8==3.7.9
|
||||
mypy==0.910
|
||||
pylint==2.10.2
|
||||
pytest==5.4.1
|
||||
pytest-cov==2.8.1
|
||||
15
mlu_370-piper/piper/src/python/run-docker
Executable file
15
mlu_370-piper/piper/src/python/run-docker
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env bash

# Runs the larynx2-train image with GPU access, forwarding all arguments.
# Follow instructions here: https://docs.docker.com/config/containers/resource_constraints/#access-an-nvidia-gpu

docker_args=(
    -it
    --gpus all
    -w "$PWD"
    --user "$(id -u):$(id -g)"
    --ipc=host
    -v "${HOME}:${HOME}"
    -v /media/cache:/media/cache:ro
    -v /etc/hostname:/etc/hostname:ro
    -v /etc/localtime:/etc/localtime:ro
)

docker run "${docker_args[@]}" larynx2-train "$@"
|
||||
29
mlu_370-piper/piper/src/python/scripts/check.sh
Executable file
29
mlu_370-piper/piper/src/python/scripts/check.sh
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env bash

# Formats, lints, and type-checks the piper_train Python package.

set -eo pipefail

# Locate this script's directory, then the project directory above it
this_dir="$( cd "$( dirname "$0" )" && pwd )"
base_dir="$(realpath "${this_dir}/..")"

# Virtual environment location (override by setting `venv`)
: "${venv:=${base_dir}/.venv}"

# Use the virtual environment's tools when it exists
if [ -d "${venv}" ]; then
    source "${venv}/bin/activate"
fi

python_files=("${base_dir}/piper_train")

# Formatters (black, isort) run first, then the checkers.
# `set -e` aborts at the first tool that fails.
for tool in black isort flake8 pylint mypy; do
    "${tool}" "${python_files[@]}"
done
|
||||
33
mlu_370-piper/piper/src/python/scripts/setup.sh
Executable file
33
mlu_370-piper/piper/src/python/scripts/setup.sh
Executable file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash

# Creates a fresh virtual environment and installs the Python dependencies
# listed in requirements.txt. Any existing environment at ${venv} is deleted.

set -eo pipefail

# Directory of *this* script
this_dir="$( cd "$( dirname "$0" )" && pwd )"

# Base directory of repo
base_dir="$(realpath "${this_dir}/..")"

# Path to virtual environment
: "${venv:=${base_dir}/.venv}"

# Python binary to use
: "${PYTHON=python3}"

# Quoted so an interpreter path containing spaces works, consistent with the
# quoted "${PYTHON}" -m venv call below.
python_version="$("${PYTHON}" --version)"

# Create virtual environment
echo "Creating virtual environment at ${venv} (${python_version})"
rm -rf "${venv}"
"${PYTHON}" -m venv "${venv}"
source "${venv}/bin/activate"

# Install Python dependencies
echo 'Installing Python dependencies'
pip3 install --upgrade pip
pip3 install --upgrade wheel setuptools

pip3 install -r "${base_dir}/requirements.txt"

# -----------------------------------------------------------------------------

echo "OK"
|
||||
61
mlu_370-piper/piper/src/python/setup.py
Normal file
61
mlu_370-piper/piper/src/python/setup.py
Normal file
@@ -0,0 +1,61 @@
|
||||
#!/usr/bin/env python3
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import setuptools
|
||||
from setuptools import setup
|
||||
|
||||
this_dir = Path(__file__).parent
module_dir = this_dir / "piper_train"

# -----------------------------------------------------------------------------

# Load README in as long description
long_description: str = ""
readme_path = this_dir / "README.md"
if readme_path.is_file():
    long_description = readme_path.read_text(encoding="utf-8")

# Read install requirements, skipping blank lines and full-line comments so
# only real requirement specifiers reach install_requires.
requirements = []
requirements_path = this_dir / "requirements.txt"
if requirements_path.is_file():
    with open(requirements_path, "r", encoding="utf-8") as requirements_file:
        requirements = [
            line.strip()
            for line in requirements_file
            if line.strip() and not line.lstrip().startswith("#")
        ]

# The package version lives in a VERSION file shipped inside the package
version_path = module_dir / "VERSION"
with open(version_path, "r", encoding="utf-8") as version_file:
    version = version_file.read().strip()

# -----------------------------------------------------------------------------

setup(
    name="piper_train",
    version=version,
    description="A fast and local neural text to speech system",
    long_description=long_description,
    # README.md is Markdown; declare it so it is not treated as reStructuredText
    long_description_content_type="text/markdown",
    url="http://github.com/rhasspy/piper",
    author="Michael Hansen",
    author_email="mike@rhasspy.org",
    license="MIT",
    packages=setuptools.find_packages(),
    package_data={
        "piper_train": ["VERSION", "py.typed"],
    },
    install_requires=requirements,
    extras_require={':python_version<"3.9"': ["importlib_resources"]},
    entry_points={
        "console_scripts": [
            "piper-train = piper_train.__main__:main",
        ]
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Topic :: Text Processing :: Linguistic",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
    ],
    keywords="rhasspy tts speech voice",
)
|
||||
3
mlu_370-piper/piper/src/python_run/.gitignore
vendored
Normal file
3
mlu_370-piper/piper/src/python_run/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
build/
|
||||
dist/
|
||||
*.egg-info/
|
||||
6
mlu_370-piper/piper/src/python_run/.isort.cfg
Normal file
6
mlu_370-piper/piper/src/python_run/.isort.cfg
Normal file
@@ -0,0 +1,6 @@
|
||||
[settings]
|
||||
multi_line_output=3
|
||||
include_trailing_comma=True
|
||||
force_grid_wrap=0
|
||||
use_parentheses=True
|
||||
line_length=88
|
||||
2
mlu_370-piper/piper/src/python_run/MANIFEST.in
Normal file
2
mlu_370-piper/piper/src/python_run/MANIFEST.in
Normal file
@@ -0,0 +1,2 @@
|
||||
include requirements.txt
|
||||
include piper/voices.json
|
||||
27
mlu_370-piper/piper/src/python_run/README_http.md
Normal file
27
mlu_370-piper/piper/src/python_run/README_http.md
Normal file
@@ -0,0 +1,27 @@
|
||||
# Piper HTTP Server
|
||||
|
||||
Install the requirements into your virtual environment:
|
||||
|
||||
```sh
|
||||
.venv/bin/pip3 install -r requirements_http.txt
|
||||
```
|
||||
|
||||
Run the web server:
|
||||
|
||||
```sh
|
||||
.venv/bin/python3 -m piper.http_server --model ...
|
||||
```
|
||||
|
||||
See `--help` for more options.
|
||||
|
||||
Using a `GET` request:
|
||||
|
||||
```sh
|
||||
curl -G --data-urlencode 'text=This is a test.' -o test.wav 'localhost:5000'
|
||||
```
|
||||
|
||||
Using a `POST` request:
|
||||
|
||||
```sh
|
||||
curl -X POST -H 'Content-Type: text/plain' --data 'This is a test.' -o test.wav 'localhost:5000'
|
||||
```
|
||||
7
mlu_370-piper/piper/src/python_run/mypy.ini
Normal file
7
mlu_370-piper/piper/src/python_run/mypy.ini
Normal file
@@ -0,0 +1,7 @@
|
||||
[mypy]
|
||||
|
||||
[mypy-onnxruntime.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-piper_phonemize.*]
|
||||
ignore_missing_imports = True
|
||||
5
mlu_370-piper/piper/src/python_run/piper/__init__.py
Normal file
5
mlu_370-piper/piper/src/python_run/piper/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .voice import PiperVoice
|
||||
|
||||
__all__ = [
|
||||
"PiperVoice",
|
||||
]
|
||||
159
mlu_370-piper/piper/src/python_run/piper/__main__.py
Normal file
159
mlu_370-piper/piper/src/python_run/piper/__main__.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
from . import PiperVoice
|
||||
from .download import ensure_voice_exists, find_voice, get_voices
|
||||
|
||||
_FILE = Path(__file__)
|
||||
_DIR = _FILE.parent
|
||||
_LOGGER = logging.getLogger(_FILE.stem)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: synthesize speech from text read on stdin.

    Output goes to one of three places depending on the flags: raw audio
    streamed to stdout (``--output-raw``), one WAV file per input line in
    ``--output-dir``, or a single WAV written to ``--output-file`` / stdout.
    If the model path does not exist, it is treated as a voice name and
    ``ensure_voice_exists`` / ``find_voice`` are used to fetch and locate it.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    cli.add_argument("-c", "--config", help="Path to model config file")
    cli.add_argument(
        "-f",
        "--output-file",
        "--output_file",
        help="Path to output WAV file (default: stdout)",
    )
    cli.add_argument(
        "-d",
        "--output-dir",
        "--output_dir",
        help="Path to output directory (default: cwd)",
    )
    cli.add_argument(
        "--output-raw",
        "--output_raw",
        action="store_true",
        help="Stream raw audio to stdout",
    )
    # Synthesis parameters
    cli.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    cli.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    cli.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    cli.add_argument("--noise-w", "--noise_w", type=float, help="Phoneme width noise")
    # Hardware
    cli.add_argument("--cuda", action="store_true", help="Use GPU")
    # Pauses
    cli.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    # Voice lookup / download
    cli.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    cli.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    cli.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    # Diagnostics
    cli.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = cli.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)
    _LOGGER.debug(args)

    if not args.download_dir:
        # Download to first data directory by default
        args.download_dir = args.data_dir[0]

    # Download voice if file doesn't exist
    if not Path(args.model).exists():
        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for info in voices_info.values():
            for alias in info.get("aliases", []):
                aliases_info[alias] = {"_is_alias": True, **info}

        voices_info.update(aliases_info)
        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)

    # Load voice
    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = {
        "speaker_id": args.speaker,
        "length_scale": args.length_scale,
        "noise_scale": args.noise_scale,
        "noise_w": args.noise_w,
        "sentence_silence": args.sentence_silence,
    }

    if args.output_raw:
        # One utterance per non-blank input line
        for raw_line in sys.stdin:
            sentence = raw_line.strip()
            if not sentence:
                continue

            # Write raw audio to stdout as it's produced
            for audio_bytes in voice.synthesize_stream_raw(sentence, **synthesize_args):
                sys.stdout.buffer.write(audio_bytes)
                sys.stdout.buffer.flush()
        return

    if args.output_dir:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # One WAV file per non-blank input line, named by a monotonic timestamp
        for raw_line in sys.stdin:
            sentence = raw_line.strip()
            if not sentence:
                continue

            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
            with wave.open(str(wav_path), "wb") as wav_file:
                voice.synthesize(sentence, wav_file, **synthesize_args)

            _LOGGER.info("Wrote %s", wav_path)
        return

    # Read entire input and produce a single WAV
    text = sys.stdin.read()
    to_stdout = (not args.output_file) or (args.output_file == "-")
    destination = sys.stdout.buffer if to_stdout else args.output_file
    with wave.open(destination, "wb") as wav_file:
        voice.synthesize(text, wav_file, **synthesize_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
53
mlu_370-piper/piper/src/python_run/piper/config.py
Normal file
53
mlu_370-piper/piper/src/python_run/piper/config.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Piper configuration"""
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Mapping, Sequence
|
||||
|
||||
|
||||
class PhonemeType(str, Enum):
    """Kind of phonemes a voice uses; members compare equal to their string values."""

    # Phonemes come from espeak
    ESPEAK = "espeak"

    # Phonemes are taken from the text itself
    TEXT = "text"
|
||||
|
||||
|
||||
@dataclass
class PiperConfig:
    """Settings for a Piper voice, as read from its config mapping."""

    # Number of phonemes
    num_symbols: int

    # Number of speakers
    num_speakers: int

    # Sample rate of output audio
    sample_rate: int

    # Name of espeak-ng voice or alphabet
    espeak_voice: str

    # Inference defaults (0.667 / 1.0 / 0.8 are used by from_dict when absent)
    length_scale: float
    noise_scale: float
    noise_w: float

    # Phoneme -> [id,]
    phoneme_id_map: Mapping[str, Sequence[int]]

    # espeak or text
    phoneme_type: PhonemeType

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        """Build a PiperConfig from a parsed config dictionary.

        Required keys raise ``KeyError`` when missing. The ``inference``
        section and ``phoneme_type`` are optional and fall back to defaults.
        """
        inference = config.get("inference", {})

        num_symbols = config["num_symbols"]
        num_speakers = config["num_speakers"]
        sample_rate = config["audio"]["sample_rate"]
        noise_scale = inference.get("noise_scale", 0.667)
        length_scale = inference.get("length_scale", 1.0)
        noise_w = inference.get("noise_w", 0.8)
        espeak_voice = config["espeak"]["voice"]
        phoneme_id_map = config["phoneme_id_map"]
        phoneme_type = PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK))

        return PiperConfig(
            num_symbols=num_symbols,
            num_speakers=num_speakers,
            sample_rate=sample_rate,
            espeak_voice=espeak_voice,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            phoneme_id_map=phoneme_id_map,
            phoneme_type=phoneme_type,
        )
|
||||
5
mlu_370-piper/piper/src/python_run/piper/const.py
Normal file
5
mlu_370-piper/piper/src/python_run/piper/const.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Constants"""
|
||||
|
||||
PAD = "_" # padding (0)
|
||||
BOS = "^" # beginning of sentence
|
||||
EOS = "$" # end of sentence
|
||||
139
mlu_370-piper/piper/src/python_run/piper/download.py
Executable file
139
mlu_370-piper/piper/src/python_run/piper/download.py
Executable file
@@ -0,0 +1,139 @@
|
||||
"""Utility for downloading Piper voices."""
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, Set, Tuple, Union
|
||||
from urllib.request import urlopen
|
||||
|
||||
from .file_hash import get_file_hash
|
||||
|
||||
URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"
|
||||
|
||||
_DIR = Path(__file__).parent
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
_SKIP_FILES = {"MODEL_CARD"}
|
||||
|
||||
|
||||
class VoiceNotFoundError(Exception):
    """Raised when a requested voice name is not in the known voices."""
|
||||
|
||||
|
||||
def get_voices(
    download_dir: Union[str, Path], update_voices: bool = False
) -> Dict[str, Any]:
    """Loads available voices from downloaded or embedded JSON file.

    Args:
        download_dir: Directory that holds (or will receive) ``voices.json``.
        update_voices: When true, download the latest ``voices.json`` into
            ``download_dir`` before loading.

    Returns:
        The parsed contents of ``voices.json``. A copy in ``download_dir`` is
        preferred over the one embedded next to this module.
    """
    download_dir = Path(download_dir)
    voices_download = download_dir / "voices.json"

    if update_voices:
        # Download latest voices.json
        voices_url = URL_FORMAT.format(file="voices.json")
        _LOGGER.debug("Downloading %s to %s", voices_url, voices_download)
        # The directory may not exist yet; create it so the open() below
        # cannot fail on a fresh download location.
        download_dir.mkdir(parents=True, exist_ok=True)
        with urlopen(voices_url) as response, open(
            voices_download, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

    # Prefer downloaded file to embedded
    voices_embedded = _DIR / "voices.json"
    voices_path = voices_download if voices_download.exists() else voices_embedded

    _LOGGER.debug("Loading %s", voices_path)
    with open(voices_path, "r", encoding="utf-8") as voices_file:
        return json.load(voices_file)
|
||||
|
||||
|
||||
def ensure_voice_exists(
    name: str,
    data_dirs: Iterable[Union[str, Path]],
    download_dir: Union[str, Path],
    voices_info: Dict[str, Any],
):
    """Verify a voice's files on disk and download any that are missing or bad.

    Each file listed for the voice is checked in every data directory for
    existence, size, and md5 digest. A file failing any check in any directory
    is queued, and queued files are then fetched into ``download_dir``.

    Raises:
        VoiceNotFoundError: ``name`` is not a key of ``voices_info``.
        ValueError: the voice lists no files and nothing was queued.
    """
    assert data_dirs, "No data dirs"
    if name not in voices_info:
        raise VoiceNotFoundError(name)

    voice_files = voices_info[name]["files"]
    pending: Set[str] = set()

    for candidate_dir in data_dirs:
        candidate_dir = Path(candidate_dir)

        # Check sizes/hashes
        for remote_path, meta in voice_files.items():
            if remote_path in pending:
                # Already planning to download
                continue

            local_name = Path(remote_path).name
            if local_name in _SKIP_FILES:
                continue

            local_path = candidate_dir / local_name
            _LOGGER.debug("Checking %s", local_path)

            if not local_path.exists():
                _LOGGER.debug("Missing %s", local_path)
                pending.add(remote_path)
                continue

            # Size check first; it avoids hashing files that are clearly wrong
            wanted_size = meta["size_bytes"]
            found_size = local_path.stat().st_size
            if wanted_size != found_size:
                _LOGGER.warning(
                    "Wrong size (expected=%s, actual=%s) for %s",
                    wanted_size,
                    found_size,
                    local_path,
                )
                pending.add(remote_path)
                continue

            wanted_hash = meta["md5_digest"]
            found_hash = get_file_hash(local_path)
            if wanted_hash != found_hash:
                _LOGGER.warning(
                    "Wrong hash (expected=%s, actual=%s) for %s",
                    wanted_hash,
                    found_hash,
                    local_path,
                )
                pending.add(remote_path)
                continue

    if not (voice_files or pending):
        raise ValueError(f"Unable to find or download voice: {name}")

    # Download missing files
    target_dir = Path(download_dir)

    for remote_path in pending:
        local_name = Path(remote_path).name
        if local_name in _SKIP_FILES:
            continue

        source_url = URL_FORMAT.format(file=remote_path)
        target_path = target_dir / local_name
        target_path.parent.mkdir(parents=True, exist_ok=True)

        _LOGGER.debug("Downloading %s to %s", source_url, target_path)
        with urlopen(source_url) as response, open(target_path, "wb") as sink:
            shutil.copyfileobj(response, sink)

        _LOGGER.info("Downloaded %s (%s)", target_path, source_url)
|
||||
|
||||
|
||||
def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
    """Return ``(model_path, config_path)`` for a voice found in ``data_dirs``.

    The first directory containing both ``<name>.onnx`` and
    ``<name>.onnx.json`` wins. Raises ``ValueError`` if none has both.
    """
    for base in data_dirs:
        base = Path(base)
        model_file = base / f"{name}.onnx"
        config_file = base / f"{name}.onnx.json"

        if not (model_file.exists() and config_file.exists()):
            continue

        return model_file, config_file

    raise ValueError(f"Missing files for voice {name}")
|
||||
46
mlu_370-piper/piper/src/python_run/piper/file_hash.py
Normal file
46
mlu_370-piper/piper/src/python_run/piper/file_hash.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
|
||||
def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
    """Return the md5 hex digest of a file, reading it in fixed-size chunks."""
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file)
        for block in iter(lambda: stream.read(bytes_per_chunk), b""):
            digest.update(block)

    return digest.hexdigest()
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main():
    """Print a JSON object mapping each given file path to its md5 digest.

    With ``--dir``, the keys are the paths relative to that directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file", nargs="+")
    parser.add_argument("--dir", help="Parent directory")
    args = parser.parse_args()

    base_dir = Path(args.dir) if args.dir else None

    hashes = {}
    for path_str in args.file:
        file_path = Path(path_str)
        # Hash using the path as given; only the reported key is made relative
        digest = get_file_hash(file_path)
        key = file_path if base_dir is None else file_path.relative_to(base_dir)
        hashes[str(key)] = digest

    json.dump(hashes, sys.stdout)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
127
mlu_370-piper/piper/src/python_run/piper/http_server.py
Normal file
127
mlu_370-piper/piper/src/python_run/piper/http_server.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import io
|
||||
import logging
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
from flask import Flask, request
|
||||
|
||||
from . import PiperVoice
|
||||
from .download import ensure_voice_exists, find_voice, get_voices
|
||||
|
||||
_LOGGER = logging.getLogger()
|
||||
|
||||
|
||||
def main() -> None:
    """Run a small HTTP server that synthesizes speech with a Piper voice.

    The voice is loaded once at startup. If the model path does not exist, it
    is treated as a voice name and fetched/located first. The single route
    ``/`` accepts text as the ``text`` query parameter (GET) or as the request
    body (POST) and answers with WAV audio.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="0.0.0.0", help="HTTP server host")
    parser.add_argument("--port", type=int, default=5000, help="HTTP server port")
    #
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    #
    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    parser.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    parser.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    parser.add_argument(
        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
    )
    #
    parser.add_argument("--cuda", action="store_true", help="Use GPU")
    #
    parser.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    #
    parser.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    parser.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    #
    parser.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    if not args.download_dir:
        # Download to first data directory by default
        args.download_dir = args.data_dir[0]

    # Download voice if file doesn't exist
    model_path = Path(args.model)
    if not model_path.exists():
        # Load voice info
        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for voice_info in voices_info.values():
            for voice_alias in voice_info.get("aliases", []):
                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}

        voices_info.update(aliases_info)
        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)

    # Load voice
    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = {
        "speaker_id": args.speaker,
        "length_scale": args.length_scale,
        "noise_scale": args.noise_scale,
        "noise_w": args.noise_w,
        "sentence_silence": args.sentence_silence,
    }

    # Create web server
    app = Flask(__name__)

    @app.route("/", methods=["GET", "POST"])
    def app_synthesize():
        """Synthesize the request's text and return it as a WAV response."""
        if request.method == "POST":
            text = request.data.decode("utf-8")
        else:
            text = request.args.get("text", "")

        text = text.strip()
        if not text:
            # Missing text is a client error: answer 400 rather than raising,
            # which Flask would report as a 500 Internal Server Error.
            return app.response_class(
                "No text provided", status=400, mimetype="text/plain"
            )

        _LOGGER.debug("Synthesizing text: %s", text)
        with io.BytesIO() as wav_io:
            with wave.open(wav_io, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)

            wav_bytes = wav_io.getvalue()

        # Label the body as WAV; a bare bytes return is sent as text/html
        return app.response_class(wav_bytes, mimetype="audio/wav")

    app.run(host=args.host, port=args.port)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
12
mlu_370-piper/piper/src/python_run/piper/util.py
Normal file
12
mlu_370-piper/piper/src/python_run/piper/util.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Utilities"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so that its largest absolute sample maps to
    ``max_wav_value``, clipped to ``[-max_wav_value, max_wav_value]`` and then
    cast to int16.

    Args:
        audio: Array of audio samples.
        max_wav_value: Target peak amplitude after scaling.

    Returns:
        An int16 array with the same shape as ``audio``. An empty input yields
        an empty int16 array.
    """
    if audio.size == 0:
        # np.max() raises ValueError on a zero-size array; there is nothing
        # to normalize, so hand back an empty int16 array of the same shape.
        return audio.astype("int16")

    # The 0.01 floor keeps (near-)silent audio from being amplified without
    # bound and avoids a division by zero.
    peak = max(0.01, np.max(np.abs(audio)))
    audio_norm = audio * (max_wav_value / peak)
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    return audio_norm.astype("int16")
|
||||
185
mlu_370-piper/piper/src/python_run/piper/voice.py
Normal file
185
mlu_370-piper/piper/src/python_run/piper/voice.py
Normal file
@@ -0,0 +1,185 @@
|
||||
import json
|
||||
import logging
|
||||
import wave
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
|
||||
|
||||
from .config import PhonemeType, PiperConfig
|
||||
from .const import BOS, EOS, PAD
|
||||
from .util import audio_float_to_int16
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PiperVoice:
    """A voice made of an ONNX Runtime inference session and its config."""

    # Inference session that runs the voice model
    session: onnxruntime.InferenceSession
    # Settings built from the voice's JSON config file
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Create a voice from an ONNX model file and its JSON config.

        When ``config_path`` is omitted, ``<model_path>.json`` is used.
        ``use_cuda`` selects the CUDA execution provider instead of the CPU one.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        providers: List[Union[str, Tuple[str, Dict[str, Any]]]]
        if not use_cuda:
            providers = ["CPUExecutionProvider"]
        else:
            cuda_provider = (
                "CUDAExecutionProvider",
                {"cudnn_conv_algo_search": "HEURISTIC"},
            )
            providers = [cuda_provider]

        # Parse the config before creating the session, as the loader
        # always has.
        voice_config = PiperConfig.from_dict(config_dict)
        inference_session = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=onnxruntime.SessionOptions(),
            providers=providers,
        )
        return PiperVoice(config=voice_config, session=inference_session)

    def phonemize(self, text: str) -> List[List[str]]:
        """Convert text into phonemes, one list per sentence.

        Raises ValueError when the configured phoneme type is neither
        ESPEAK nor TEXT.
        """
        if self.config.phoneme_type == PhonemeType.ESPEAK:
            voice_name = self.config.espeak_voice
            if voice_name == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)

            return phonemize_espeak(text, self.config.espeak_voice)

        if self.config.phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Map phonemes to model ids.

        The result starts with the BOS ids and ends with the EOS ids; every
        known phoneme is followed by the PAD ids. Phonemes missing from the
        id map are logged and skipped.
        """
        id_map = self.config.phoneme_id_map
        result: List[int] = list(id_map[BOS])

        for symbol in phonemes:
            if symbol in id_map:
                result.extend(id_map[symbol])
                result.extend(id_map[PAD])
            else:
                _LOGGER.warning("Missing phoneme from id map: %s", symbol)

        result.extend(id_map[EOS])
        return result

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize text and write it to ``wav_file`` as 16-bit mono audio."""
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        raw_chunks = self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        )
        for chunk in raw_chunks:
            wav_file.writeframes(chunk)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Yield raw audio bytes for each sentence of ``text``.

        Each chunk is followed by ``sentence_silence`` seconds of zero bytes.
        """
        sentences = self.phonemize(text)

        # 16-bit mono: two bytes per sample
        silence_samples = int(sentence_silence * self.config.sample_rate)
        trailing_silence = bytes(silence_samples * 2)

        for sentence in sentences:
            ids = self.phonemes_to_ids(sentence)
            sentence_audio = self.synthesize_ids_to_raw(
                ids,
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            )
            yield sentence_audio + trailing_silence

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Run the model on phoneme ids and return int16 audio as bytes.

        Scales left as None fall back to the values in the voice config.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        # Batch of one: add a leading axis to the id sequence
        ids_batch = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        ids_lengths = np.array([ids_batch.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        args = {
            "input": ids_batch,
            "input_lengths": ids_lengths,
            "scales": scales,
        }

        if self.config.num_speakers <= 1:
            # Single-speaker voices never receive a speaker id
            speaker_id = None
        elif speaker_id is None:
            # Default speaker
            speaker_id = 0

        if speaker_id is not None:
            args["sid"] = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        outputs = self.session.run(
            None,
            args,
        )
        audio = outputs[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        return audio.tobytes()
|
||||
4222
mlu_370-piper/piper/src/python_run/piper/voices.json
Normal file
4222
mlu_370-piper/piper/src/python_run/piper/voices.json
Normal file
File diff suppressed because it is too large
Load Diff
0
mlu_370-piper/piper/src/python_run/py.typed
Normal file
0
mlu_370-piper/piper/src/python_run/py.typed
Normal file
37
mlu_370-piper/piper/src/python_run/pylintrc
Normal file
37
mlu_370-piper/piper/src/python_run/pylintrc
Normal file
@@ -0,0 +1,37 @@
|
||||
[MESSAGES CONTROL]
|
||||
disable=
|
||||
format,
|
||||
abstract-method,
|
||||
cyclic-import,
|
||||
duplicate-code,
|
||||
global-statement,
|
||||
import-outside-toplevel,
|
||||
inconsistent-return-statements,
|
||||
locally-disabled,
|
||||
not-context-manager,
|
||||
too-few-public-methods,
|
||||
too-many-arguments,
|
||||
too-many-branches,
|
||||
too-many-instance-attributes,
|
||||
too-many-lines,
|
||||
too-many-locals,
|
||||
too-many-public-methods,
|
||||
too-many-return-statements,
|
||||
too-many-statements,
|
||||
too-many-boolean-expressions,
|
||||
unnecessary-pass,
|
||||
unused-argument,
|
||||
broad-except,
|
||||
too-many-nested-blocks,
|
||||
invalid-name,
|
||||
unused-import,
|
||||
fixme,
|
||||
useless-super-delegation,
|
||||
missing-module-docstring,
|
||||
missing-class-docstring,
|
||||
missing-function-docstring,
|
||||
import-error,
|
||||
relative-beyond-top-level
|
||||
|
||||
[FORMAT]
|
||||
expected-line-ending-format=LF
|
||||
2
mlu_370-piper/piper/src/python_run/requirements.txt
Normal file
2
mlu_370-piper/piper/src/python_run/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
piper-phonemize~=1.1.0
|
||||
onnxruntime>=1.11.0,<2
|
||||
5
mlu_370-piper/piper/src/python_run/requirements_dev.txt
Normal file
5
mlu_370-piper/piper/src/python_run/requirements_dev.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
black==22.12.0
|
||||
flake8==6.0.0
|
||||
isort==5.11.3
|
||||
mypy==0.991
|
||||
pylint==2.15.9
|
||||
1
mlu_370-piper/piper/src/python_run/requirements_gpu.txt
Normal file
1
mlu_370-piper/piper/src/python_run/requirements_gpu.txt
Normal file
@@ -0,0 +1 @@
|
||||
onnxruntime-gpu>=1.11.0,<2
|
||||
1
mlu_370-piper/piper/src/python_run/requirements_http.txt
Normal file
1
mlu_370-piper/piper/src/python_run/requirements_http.txt
Normal file
@@ -0,0 +1 @@
|
||||
flask>=3,<4
|
||||
13
mlu_370-piper/piper/src/python_run/script/format
Executable file
13
mlu_370-piper/piper/src/python_run/script/format
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import venv
|
||||
from pathlib import Path
|
||||
|
||||
# Resolve the project layout relative to this script.
_DIR = Path(__file__).parent
_PROGRAM_DIR = _DIR.parent
_VENV_DIR = _PROGRAM_DIR / ".venv"
_MODULE_DIR = _PROGRAM_DIR / "piper"

# Use the virtual environment's interpreter to run each formatter, in order,
# over the piper module directory.
context = venv.EnvBuilder().ensure_directories(_VENV_DIR)
for _formatter in ("black", "isort"):
    subprocess.check_call([context.env_exe, "-m", _formatter, str(_MODULE_DIR)])
|
||||
16
mlu_370-piper/piper/src/python_run/script/lint
Executable file
16
mlu_370-piper/piper/src/python_run/script/lint
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import venv
|
||||
from pathlib import Path
|
||||
|
||||
# Resolve the project layout relative to this script.
_DIR = Path(__file__).parent
_PROGRAM_DIR = _DIR.parent
_VENV_DIR = _PROGRAM_DIR / ".venv"
_MODULE_DIR = _PROGRAM_DIR / "piper"

context = venv.EnvBuilder().ensure_directories(_VENV_DIR)

# (tool module, extra arguments) pairs, run in this order with the virtual
# environment's interpreter; the first failing tool aborts the script.
_LINT_STEPS = (
    ("black", ["--check"]),
    ("isort", ["--check"]),
    ("flake8", []),
    ("pylint", []),
    ("mypy", []),
)
for _tool, _extra_args in _LINT_STEPS:
    subprocess.check_call(
        [context.env_exe, "-m", _tool, str(_MODULE_DIR)] + _extra_args
    )
|
||||
12
mlu_370-piper/piper/src/python_run/script/piper
Executable file
12
mlu_370-piper/piper/src/python_run/script/piper
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import subprocess
|
||||
import venv
|
||||
from pathlib import Path
|
||||
|
||||
# Resolve the project layout relative to this script.
_DIR = Path(__file__).parent
_PROGRAM_DIR = _DIR.parent
_VENV_DIR = _PROGRAM_DIR / ".venv"

# Run the piper module with the virtual environment's interpreter, forwarding
# every command-line argument given to this wrapper.
context = venv.EnvBuilder().ensure_directories(_VENV_DIR)
_command = [context.env_exe, "-m", "piper", *sys.argv[1:]]
subprocess.check_call(_command)
|
||||
31
mlu_370-piper/piper/src/python_run/script/setup
Executable file
31
mlu_370-piper/piper/src/python_run/script/setup
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import venv
|
||||
from pathlib import Path
|
||||
|
||||
# Resolve the project layout relative to this script.
_DIR = Path(__file__).parent
_PROGRAM_DIR = _DIR.parent
_VENV_DIR = _PROGRAM_DIR / ".venv"

# Create virtual environment
builder = venv.EnvBuilder(with_pip=True)
context = builder.ensure_directories(_VENV_DIR)
builder.create(_VENV_DIR)

pip = [context.env_exe, "-m", "pip"]

# Upgrade the packaging tools first, then install the project requirements
# (with an extra find-links location for prebuilt packages).
_PIP_STEPS = (
    ["install", "--upgrade", "pip"],
    ["install", "--upgrade", "setuptools", "wheel"],
    [
        "install",
        "-f",
        "https://synesthesiam.github.io/prebuilt-apps/",
        "-r",
        str(_PROGRAM_DIR / "requirements.txt"),
    ],
)
for _pip_args in _PIP_STEPS:
    subprocess.check_call(pip + _pip_args)
|
||||
22
mlu_370-piper/piper/src/python_run/setup.cfg
Normal file
22
mlu_370-piper/piper/src/python_run/setup.cfg
Normal file
@@ -0,0 +1,22 @@
|
||||
[flake8]
|
||||
# To work with Black
|
||||
max-line-length = 88
|
||||
# E501: line too long
|
||||
# W503: Line break occurred before a binary operator
|
||||
# E203: Whitespace before ':'
|
||||
# D202 No blank lines allowed after function docstring
|
||||
# W504 line break after binary operator
|
||||
ignore =
|
||||
E501,
|
||||
W503,
|
||||
E203,
|
||||
D202,
|
||||
W504
|
||||
|
||||
[isort]
|
||||
multi_line_output = 3
|
||||
include_trailing_comma=True
|
||||
force_grid_wrap=0
|
||||
use_parentheses=True
|
||||
line_length=88
|
||||
indent = " "
|
||||
48
mlu_370-piper/piper/src/python_run/setup.py
Normal file
48
mlu_370-piper/piper/src/python_run/setup.py
Normal file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env python3
|
||||
from pathlib import Path
|
||||
|
||||
import setuptools
|
||||
from setuptools import setup
|
||||
|
||||
this_dir = Path(__file__).parent
module_dir = this_dir / "piper"

# Install requirements are the lines of requirements.txt, when that file
# sits next to this script; otherwise there are none.
requirements_path = this_dir / "requirements.txt"
if requirements_path.is_file():
    requirements = requirements_path.read_text(encoding="utf-8").splitlines()
else:
    requirements = []

# Data files shipped inside the package, listed relative to the module dir.
data_files = [module_dir / "voices.json"]
_package_data = {"piper": [str(p.relative_to(module_dir)) for p in data_files]}

_extras = {
    "gpu": ["onnxruntime-gpu>=1.11.0,<2"],
    "http": ["flask>=3,<4"],
}

_classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Topic :: Text Processing :: Linguistic",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
]

# -----------------------------------------------------------------------------

setup(
    name="piper-tts",
    version="1.2.0",
    description="A fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4.",
    url="http://github.com/rhasspy/piper",
    author="Michael Hansen",
    author_email="mike@rhasspy.org",
    license="MIT",
    packages=setuptools.find_packages(),
    package_data=_package_data,
    entry_points={"console_scripts": ["piper = piper.__main__:main"]},
    install_requires=requirements,
    extras_require=_extras,
    classifiers=_classifiers,
    keywords="rhasspy piper tts",
)
|
||||
Reference in New Issue
Block a user