update README

2025-09-10 10:47:02 +08:00
parent 5088f0b50a
commit ff78032400
603 changed files with 21 additions and 23 deletions
--- a/mlu_370-piper/piper/src/benchmark/benchmark_generator.py
+++ b/mlu_370-piper/piper/src/benchmark/benchmark_generator.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import json
+import time
+import statistics
+import sys
+
+import torch
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m", "--model", required=True, help="Path to generator file (.pt)"
+    )
+    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    if not args.config:
+        args.config = f"{args.model}.json"
+
+    with open(args.config, "r", encoding="utf-8") as config_file:
+        config = json.load(config_file)
+
+    sample_rate = config["audio"]["sample_rate"]
+    utterances = [json.loads(line) for line in sys.stdin]
+
+    start_time = time.monotonic_ns()
+    model = torch.load(args.model)
+    end_time = time.monotonic_ns()
+
+    model.eval()
+
+    load_sec = (end_time - start_time) / 1e9
+    synthesize_rtf = []
+    for utterance in utterances:
+        phoneme_ids = utterance["phoneme_ids"]
+        speaker_id = utterance.get("speaker_id")
+        synthesize_rtf.append(
+            synthesize(
+                model,
+                phoneme_ids,
+                speaker_id,
+                sample_rate,
+            )
+        )
+
+    json.dump(
+        {
+            "load_sec": load_sec,
+            "rtf_mean": statistics.mean(synthesize_rtf),
+            "rtf_stdev": statistics.stdev(synthesize_rtf),
+            "synthesize_rtf": synthesize_rtf,
+        },
+        sys.stdout,
+    )
+
+
+def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
+    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
+    text_lengths = torch.LongTensor([len(phoneme_ids)])
+    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None
+
+    start_time = time.monotonic_ns()
+    audio = (
+        model(
+            text,
+            text_lengths,
+            sid,
+        )[0]
+        .detach()
+        .numpy()
+        .squeeze()
+    )
+    end_time = time.monotonic_ns()
+
+    audio_sec = len(audio) / sample_rate
+    infer_sec = (end_time - start_time) / 1e9
+    rtf = infer_sec / audio_sec
+
+    _LOGGER.debug(
+        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
+        rtf,
+        infer_sec,
+        audio_sec,
+    )
+
+    return rtf
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/benchmark/benchmark_onnx.py
+++ b/mlu_370-piper/piper/src/benchmark/benchmark_onnx.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import json
+import time
+import statistics
+import sys
+
+import onnxruntime
+import numpy as np
+
+_NOISE_SCALE = 0.667
+_LENGTH_SCALE = 1.0
+_NOISE_W = 0.8
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m", "--model", required=True, help="Path to Onnx model file (.onnx)"
+    )
+    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    if not args.config:
+        args.config = f"{args.model}.json"
+
+    with open(args.config, "r", encoding="utf-8") as config_file:
+        config = json.load(config_file)
+
+    sample_rate = config["audio"]["sample_rate"]
+    utterances = [json.loads(line) for line in sys.stdin]
+
+    start_time = time.monotonic_ns()
+
+    session_options = onnxruntime.SessionOptions()
+    session_options.graph_optimization_level = (
+        onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
+    )
+    # session_options.enable_cpu_mem_arena = False
+    # session_options.enable_mem_pattern = False
+    session_options.enable_mem_reuse = False
+    # session_options.enable_profiling = False
+    # session_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
+    # session_options.execution_order = onnxruntime.ExecutionOrder.PRIORITY_BASED
+
+    session = onnxruntime.InferenceSession(
+        args.model,
+        sess_options=session_options,
+    )
+    # session.intra_op_num_threads = 1
+    # session.inter_op_num_threads = 1
+
+    end_time = time.monotonic_ns()
+
+    load_sec = (end_time - start_time) / 1e9
+    synthesize_rtf = []
+    for utterance in utterances:
+        phoneme_ids = utterance["phoneme_ids"]
+        speaker_id = utterance.get("speaker_id")
+        synthesize_rtf.append(
+            synthesize(
+                session,
+                phoneme_ids,
+                speaker_id,
+                sample_rate,
+            )
+        )
+
+    json.dump(
+        {
+            "load_sec": load_sec,
+            "rtf_mean": statistics.mean(synthesize_rtf),
+            "rtf_stdev": statistics.stdev(synthesize_rtf),
+            "rtfs": synthesize_rtf,
+        },
+        sys.stdout,
+    )
+
+
+def synthesize(session, phoneme_ids, speaker_id, sample_rate) -> float:
+    phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+    phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+    scales = np.array(
+        [_NOISE_SCALE, _LENGTH_SCALE, _NOISE_W],
+        dtype=np.float32,
+    )
+
+    sid = None
+
+    if speaker_id is not None:
+        sid = np.array([speaker_id], dtype=np.int64)
+
+    # Synthesize through Onnx
+    start_time = time.monotonic_ns()
+    audio = session.run(
+        None,
+        {
+            "input": phoneme_ids_array,
+            "input_lengths": phoneme_ids_lengths,
+            "scales": scales,
+            "sid": sid,
+        },
+    )[0].squeeze()
+    end_time = time.monotonic_ns()
+
+    audio_sec = len(audio) / sample_rate
+    infer_sec = (end_time - start_time) / 1e9
+    rtf = infer_sec / audio_sec
+
+    _LOGGER.debug(
+        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
+        rtf,
+        infer_sec,
+        audio_sec,
+    )
+
+    return rtf
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/benchmark/benchmark_torchscript.py
+++ b/mlu_370-piper/piper/src/benchmark/benchmark_torchscript.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import json
+import time
+import statistics
+import sys
+
+import torch
+
+_NOISE_SCALE = 0.667
+_LENGTH_SCALE = 1.0
+_NOISE_W = 0.8
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m", "--model", required=True, help="Path to Torchscript file (.ts)"
+    )
+    parser.add_argument("-c", "--config", help="Path to model config file (.json)")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    if not args.config:
+        args.config = f"{args.model}.json"
+
+    with open(args.config, "r", encoding="utf-8") as config_file:
+        config = json.load(config_file)
+
+    sample_rate = config["audio"]["sample_rate"]
+    utterances = [json.loads(line) for line in sys.stdin]
+
+    start_time = time.monotonic_ns()
+    model = torch.jit.load(args.model)
+    end_time = time.monotonic_ns()
+
+    model.eval()
+
+    load_sec = (end_time - start_time) / 1e9
+    synthesize_rtf = []
+    for utterance in utterances:
+        phoneme_ids = utterance["phoneme_ids"]
+        speaker_id = utterance.get("speaker_id")
+        synthesize_rtf.append(
+            synthesize(
+                model,
+                phoneme_ids,
+                speaker_id,
+                sample_rate,
+            )
+        )
+
+    json.dump(
+        {
+            "load_sec": load_sec,
+            "rtf_mean": statistics.mean(synthesize_rtf),
+            "rtf_stdev": statistics.stdev(synthesize_rtf),
+            "synthesize_rtf": synthesize_rtf,
+        },
+        sys.stdout,
+    )
+
+
+def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
+    text = torch.LongTensor(phoneme_ids).unsqueeze(0)
+    text_lengths = torch.LongTensor([len(phoneme_ids)])
+    sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None
+
+    start_time = time.monotonic_ns()
+    audio = (
+        model(
+            text,
+            text_lengths,
+            sid,
+            torch.FloatTensor([_NOISE_SCALE]),
+            torch.FloatTensor([_LENGTH_SCALE]),
+            torch.FloatTensor([_NOISE_W]),
+        )[0]
+        .detach()
+        .numpy()
+        .squeeze()
+    )
+    end_time = time.monotonic_ns()
+
+    audio_sec = len(audio) / sample_rate
+    infer_sec = (end_time - start_time) / 1e9
+    rtf = infer_sec / audio_sec
+
+    _LOGGER.debug(
+        "Real-time factor: %s (infer=%s sec, audio=%s sec)",
+        rtf,
+        infer_sec,
+        audio_sec,
+    )
+
+    return rtf
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/benchmark/requirements.txt
+++ b/mlu_370-piper/piper/src/benchmark/requirements.txt
@@ -0,0 +1,2 @@
+onnxruntime~=1.11.0
+torch~=1.11.0
--- a/mlu_370-piper/piper/src/cpp/json.hpp
+++ b/mlu_370-piper/piper/src/cpp/json.hpp
--- a/mlu_370-piper/piper/src/cpp/main.cpp
+++ b/mlu_370-piper/piper/src/cpp/main.cpp
@@ -0,0 +1,561 @@
+#include <chrono>
+#include <condition_variable>
+#include <filesystem>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <vector>
+
+#ifdef _MSC_VER
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#endif
+
+#ifdef _WIN32
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#ifdef __APPLE__
+#include <mach-o/dyld.h>
+#endif
+
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+
+#include "json.hpp"
+#include "piper.hpp"
+
+using namespace std;
+using json = nlohmann::json;
+
+// Where synthesized audio goes: a single WAV file, automatically named WAV
+// files in a directory, a WAV stream on stdout, or raw samples on stdout.
+enum OutputType { OUTPUT_FILE, OUTPUT_DIRECTORY, OUTPUT_STDOUT, OUTPUT_RAW };
+
+// Settings collected from the command line by parseArgs.
+// Optional fields that are left unset do not override the loaded voice's
+// own synthesis settings.
+struct RunConfig {
+  // Path to .onnx voice file
+  filesystem::path modelPath;
+
+  // Path to JSON voice config file
+  filesystem::path modelConfigPath;
+
+  // Type of output to produce.
+  // Default is to write a WAV file in the current directory.
+  OutputType outputType = OUTPUT_DIRECTORY;
+
+  // Path for output (a file or a directory depending on outputType;
+  // unset when writing a WAV to stdout)
+  optional<filesystem::path> outputPath = filesystem::path(".");
+
+  // Numerical id of the default speaker (multi-speaker voices)
+  optional<piper::SpeakerId> speakerId;
+
+  // Amount of noise to add during audio generation
+  optional<float> noiseScale;
+
+  // Speed of speaking (1 = normal, < 1 is faster, > 1 is slower)
+  optional<float> lengthScale;
+
+  // Variation in phoneme lengths
+  optional<float> noiseW;
+
+  // Seconds of silence to add after each sentence
+  optional<float> sentenceSilenceSeconds;
+
+  // Path to espeak-ng data directory (default is next to piper executable)
+  optional<filesystem::path> eSpeakDataPath;
+
+  // Path to libtashkeel ort model (default is next to piper executable)
+  // https://github.com/mush42/libtashkeel/
+  optional<filesystem::path> tashkeelModelPath;
+
+  // stdin input is lines of JSON instead of text with format:
+  // {
+  //   "text": str,               (required)
+  //   "speaker_id": int,         (optional)
+  //   "speaker": str,            (optional)
+  //   "output_file": str,        (optional)
+  // }
+  bool jsonInput = false;
+
+  // Seconds of extra silence to insert after a single phoneme
+  optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
+
+  // true to use CUDA execution provider
+  bool useCuda = false;
+};
+
+void parseArgs(int argc, char *argv[], RunConfig &runConfig);
+void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
+                   condition_variable &cvAudio, bool &audioReady,
+                   bool &audioFinished);
+
+// ----------------------------------------------------------------------------
+
+// Entry point: parses the command line, loads the voice, applies any
+// command-line overrides, then synthesizes audio for each line read from
+// stdin according to the selected output type.
+int main(int argc, char *argv[]) {
+  spdlog::set_default_logger(spdlog::stderr_color_st("piper"));
+
+  RunConfig runConfig;
+  parseArgs(argc, argv, runConfig);
+
+#ifdef _WIN32
+  // Required on Windows to show IPA symbols
+  SetConsoleOutputCP(CP_UTF8);
+#endif
+
+  piper::PiperConfig piperConfig;
+  piper::Voice voice;
+
+  spdlog::debug("Loading voice from {} (config={})",
+                runConfig.modelPath.string(),
+                runConfig.modelConfigPath.string());
+
+  // Time how long the voice takes to load
+  auto startTime = chrono::steady_clock::now();
+  loadVoice(piperConfig, runConfig.modelPath.string(),
+            runConfig.modelConfigPath.string(), voice, runConfig.speakerId,
+            runConfig.useCuda);
+  auto endTime = chrono::steady_clock::now();
+  spdlog::info("Loaded voice in {} second(s)",
+               chrono::duration<double>(endTime - startTime).count());
+
+  // Get the path to the piper executable so we can locate espeak-ng-data, etc.
+  // next to it.
+#ifdef _MSC_VER
+  auto exePath = []() {
+    wchar_t moduleFileName[MAX_PATH] = {0};
+    GetModuleFileNameW(nullptr, moduleFileName, std::size(moduleFileName));
+    return filesystem::path(moduleFileName);
+  }();
+#else
+#ifdef __APPLE__
+  auto exePath = []() {
+    char moduleFileName[PATH_MAX] = {0};
+    uint32_t moduleFileNameSize = std::size(moduleFileName);
+    _NSGetExecutablePath(moduleFileName, &moduleFileNameSize);
+    return filesystem::path(moduleFileName);
+  }();
+#else
+  auto exePath = filesystem::canonical("/proc/self/exe");
+#endif
+#endif
+
+  if (voice.phonemizeConfig.phonemeType == piper::eSpeakPhonemes) {
+    spdlog::debug("Voice uses eSpeak phonemes ({})",
+                  voice.phonemizeConfig.eSpeak.voice);
+
+    if (runConfig.eSpeakDataPath) {
+      // User provided path
+      piperConfig.eSpeakDataPath = runConfig.eSpeakDataPath.value().string();
+    } else {
+      // Assume next to piper executable
+      piperConfig.eSpeakDataPath =
+          std::filesystem::absolute(
+              exePath.parent_path().append("espeak-ng-data"))
+              .string();
+
+      spdlog::debug("espeak-ng-data directory is expected at {}",
+                    piperConfig.eSpeakDataPath);
+    }
+  } else {
+    // Not using eSpeak
+    piperConfig.useESpeak = false;
+  }
+
+  // Enable libtashkeel for Arabic
+  if (voice.phonemizeConfig.eSpeak.voice == "ar") {
+    piperConfig.useTashkeel = true;
+    if (runConfig.tashkeelModelPath) {
+      // User provided path
+      piperConfig.tashkeelModelPath =
+          runConfig.tashkeelModelPath.value().string();
+    } else {
+      // Assume next to piper executable
+      piperConfig.tashkeelModelPath =
+          std::filesystem::absolute(
+              exePath.parent_path().append("libtashkeel_model.ort"))
+              .string();
+
+      spdlog::debug("libtashkeel model is expected at {}",
+                    piperConfig.tashkeelModelPath.value());
+    }
+  }
+
+  piper::initialize(piperConfig);
+
+  // Scales (command-line values replace the ones loaded with the voice)
+  if (runConfig.noiseScale) {
+    voice.synthesisConfig.noiseScale = runConfig.noiseScale.value();
+  }
+
+  if (runConfig.lengthScale) {
+    voice.synthesisConfig.lengthScale = runConfig.lengthScale.value();
+  }
+
+  if (runConfig.noiseW) {
+    voice.synthesisConfig.noiseW = runConfig.noiseW.value();
+  }
+
+  if (runConfig.sentenceSilenceSeconds) {
+    voice.synthesisConfig.sentenceSilenceSeconds =
+        runConfig.sentenceSilenceSeconds.value();
+  }
+
+  if (runConfig.phonemeSilenceSeconds) {
+    if (!voice.synthesisConfig.phonemeSilenceSeconds) {
+      // Overwrite
+      voice.synthesisConfig.phonemeSilenceSeconds =
+          runConfig.phonemeSilenceSeconds;
+    } else {
+      // Merge: try_emplace only adds phonemes the voice does not already
+      // have, so existing entries in the voice config are kept.
+      for (const auto &[phoneme, silenceSeconds] :
+           *runConfig.phonemeSilenceSeconds) {
+        voice.synthesisConfig.phonemeSilenceSeconds->try_emplace(
+            phoneme, silenceSeconds);
+      }
+    }
+
+  } // if phonemeSilenceSeconds
+
+  if (runConfig.outputType == OUTPUT_DIRECTORY) {
+    runConfig.outputPath = filesystem::absolute(runConfig.outputPath.value());
+    spdlog::info("Output directory: {}", runConfig.outputPath.value().string());
+  }
+
+  // Synthesize one utterance per line of stdin
+  string line;
+  piper::SynthesisResult result;
+  while (getline(cin, line)) {
+    auto outputType = runConfig.outputType;
+    // Remember the current speaker so a per-line override can be undone below
+    auto speakerId = voice.synthesisConfig.speakerId;
+    std::optional<filesystem::path> maybeOutputPath = runConfig.outputPath;
+
+    if (runConfig.jsonInput) {
+      // Each line is a JSON object
+      json lineRoot = json::parse(line);
+
+      // Text is required
+      line = lineRoot["text"].get<std::string>();
+
+      if (lineRoot.contains("output_file")) {
+        // Override output WAV file path
+        outputType = OUTPUT_FILE;
+        maybeOutputPath =
+            filesystem::path(lineRoot["output_file"].get<std::string>());
+      }
+
+      if (lineRoot.contains("speaker_id")) {
+        // Override speaker id
+        voice.synthesisConfig.speakerId =
+            lineRoot["speaker_id"].get<piper::SpeakerId>();
+      } else if (lineRoot.contains("speaker")) {
+        // Resolve to id using speaker id map
+        auto speakerName = lineRoot["speaker"].get<std::string>();
+        if ((voice.modelConfig.speakerIdMap) &&
+            (voice.modelConfig.speakerIdMap->count(speakerName) > 0)) {
+          voice.synthesisConfig.speakerId =
+              (*voice.modelConfig.speakerIdMap)[speakerName];
+        } else {
+          spdlog::warn("No speaker named: {}", speakerName);
+        }
+      }
+    }
+
+    // Timestamp is used for path to output WAV file
+    const auto now = chrono::system_clock::now();
+    const auto timestamp =
+        chrono::duration_cast<chrono::nanoseconds>(now.time_since_epoch())
+            .count();
+
+    if (outputType == OUTPUT_DIRECTORY) {
+      // Generate path using timestamp
+      stringstream outputName;
+      outputName << timestamp << ".wav";
+      filesystem::path outputPath = runConfig.outputPath.value();
+      outputPath.append(outputName.str());
+
+      // Output audio to automatically-named WAV file in a directory
+      ofstream audioFile(outputPath.string(), ios::binary);
+      piper::textToWavFile(piperConfig, voice, line, audioFile, result);
+      cout << outputPath.string() << endl;
+    } else if (outputType == OUTPUT_FILE) {
+      if (!maybeOutputPath || maybeOutputPath->empty()) {
+        throw runtime_error("No output path provided");
+      }
+
+      filesystem::path outputPath = maybeOutputPath.value();
+
+      if (!runConfig.jsonInput) {
+        // Read all of standard input before synthesizing.
+        // Otherwise, we would overwrite the output file for each line.
+        stringstream text;
+        text << line;
+        while (getline(cin, line)) {
+          text << " " << line;
+        }
+
+        line = text.str();
+      }
+
+      // Output audio to WAV file
+      ofstream audioFile(outputPath.string(), ios::binary);
+      piper::textToWavFile(piperConfig, voice, line, audioFile, result);
+      cout << outputPath.string() << endl;
+    } else if (outputType == OUTPUT_STDOUT) {
+      // Output WAV to stdout
+      piper::textToWavFile(piperConfig, voice, line, cout, result);
+    } else if (outputType == OUTPUT_RAW) {
+      // Raw output to stdout
+      mutex mutAudio;
+      condition_variable cvAudio;
+      bool audioReady = false;
+      bool audioFinished = false;
+      vector<int16_t> audioBuffer;
+      vector<int16_t> sharedAudioBuffer;
+
+#ifdef _WIN32
+      // Needed on Windows to avoid terminal conversions
+      setmode(fileno(stdout), O_BINARY);
+      setmode(fileno(stdin), O_BINARY);
+#endif
+
+      // A separate thread writes audio to stdout while synthesis continues
+      thread rawOutputThread(rawOutputProc, ref(sharedAudioBuffer),
+                             ref(mutAudio), ref(cvAudio), ref(audioReady),
+                             ref(audioFinished));
+      auto audioCallback = [&audioBuffer, &sharedAudioBuffer, &mutAudio,
+                            &cvAudio, &audioReady]() {
+        // Signal thread that audio is ready
+        {
+          unique_lock lockAudio(mutAudio);
+          copy(audioBuffer.begin(), audioBuffer.end(),
+               back_inserter(sharedAudioBuffer));
+          audioReady = true;
+          cvAudio.notify_one();
+        }
+      };
+      piper::textToAudio(piperConfig, voice, line, audioBuffer, result,
+                         audioCallback);
+
+      // Signal thread that there is no more audio
+      {
+        unique_lock lockAudio(mutAudio);
+        audioReady = true;
+        audioFinished = true;
+        cvAudio.notify_one();
+      }
+
+      // Wait for audio output to finish
+      spdlog::info("Waiting for audio to finish playing...");
+      rawOutputThread.join();
+    }
+
+    spdlog::info("Real-time factor: {} (infer={} sec, audio={} sec)",
+                 result.realTimeFactor, result.inferSeconds,
+                 result.audioSeconds);
+
+    // Restore config (--json-input)
+    voice.synthesisConfig.speakerId = speakerId;
+
+  } // for each line
+
+  piper::terminate(piperConfig);
+
+  return EXIT_SUCCESS;
+}
+
+// ----------------------------------------------------------------------------
+
+// Thread body for raw output: waits until audioReady is set, moves whatever
+// is in sharedAudioBuffer to a private buffer while holding mutAudio, then
+// writes it to stdout as raw int16 samples. Returns once audioFinished is set
+// and the shared buffer is empty.
+void rawOutputProc(vector<int16_t> &sharedAudioBuffer, mutex &mutAudio,
+                   condition_variable &cvAudio, bool &audioReady,
+                   bool &audioFinished) {
+  vector<int16_t> internalAudioBuffer;
+  while (true) {
+    {
+      unique_lock lockAudio{mutAudio};
+      cvAudio.wait(lockAudio, [&audioReady] { return audioReady; });
+
+      if (sharedAudioBuffer.empty() && audioFinished) {
+        break;
+      }
+
+      // Copy under the lock so the write to stdout can happen without it
+      copy(sharedAudioBuffer.begin(), sharedAudioBuffer.end(),
+           back_inserter(internalAudioBuffer));
+
+      sharedAudioBuffer.clear();
+
+      // Once finished, audioReady stays set so the next wait returns
+      // immediately and the loop can reach the exit check above.
+      if (!audioFinished) {
+        audioReady = false;
+      }
+    }
+
+    cout.write((const char *)internalAudioBuffer.data(),
+               sizeof(int16_t) * internalAudioBuffer.size());
+    cout.flush();
+    internalAudioBuffer.clear();
+  }
+
+} // rawOutputProc
+
+// ----------------------------------------------------------------------------
+
+// Print command-line usage to stderr.
+// argv[0] is shown as the program name.
+void printUsage(char *argv[]) {
+  cerr << endl;
+  cerr << "usage: " << argv[0] << " [options]" << endl;
+  cerr << endl;
+  cerr << "options:" << endl;
+  cerr << "   -h        --help              show this message and exit" << endl;
+  cerr << "   -m  FILE  --model       FILE  path to onnx model file" << endl;
+  cerr << "   -c  FILE  --config      FILE  path to model config file "
+          "(default: model path + .json)"
+       << endl;
+  cerr << "   -f  FILE  --output_file FILE  path to output WAV file ('-' for "
+          "stdout)"
+       << endl;
+  cerr << "   -d  DIR   --output_dir  DIR   path to output directory (default: "
+          "cwd)"
+       << endl;
+  cerr << "   --output_raw                  output raw audio to stdout as it "
+          "becomes available"
+       << endl;
+  cerr << "   -s  NUM   --speaker     NUM   id of speaker (default: 0)" << endl;
+  cerr << "   --noise_scale           NUM   generator noise (default: 0.667)"
+       << endl;
+  cerr << "   --length_scale          NUM   phoneme length (default: 1.0)"
+       << endl;
+  cerr << "   --noise_w               NUM   phoneme width noise (default: 0.8)"
+       << endl;
+  cerr << "   --sentence_silence      NUM   seconds of silence after each "
+          "sentence (default: 0.2)"
+       << endl;
+  // Accepted by parseArgs: a single-codepoint phoneme followed by seconds
+  cerr << "   --phoneme_silence   PH  NUM   seconds of extra silence after "
+          "phoneme PH"
+       << endl;
+  cerr << "   --espeak_data           DIR   path to espeak-ng data directory"
+       << endl;
+  cerr << "   --tashkeel_model        FILE  path to libtashkeel onnx model "
+          "(arabic)"
+       << endl;
+  cerr << "   --json-input                  stdin input is lines of JSON "
+          "instead of plain text"
+       << endl;
+  cerr << "   --use-cuda                    use CUDA execution provider"
+       << endl;
+  cerr << "   --version                     print version and exit" << endl;
+  cerr << "   --debug                       print DEBUG messages to the console"
+       << endl;
+  cerr << "   -q       --quiet              disable logging" << endl;
+  cerr << endl;
+}
+
+// Verify that the option at index argi is followed by a value.
+// Prints usage and exits with a failure status when the value is missing, so
+// that calling scripts can detect the malformed command line.
+void ensureArg(int argc, char *argv[], int argi) {
+  if ((argi + 1) >= argc) {
+    printUsage(argv);
+    exit(1);
+  }
+}
+
+// Parse command-line arguments
+// Parse command-line arguments into runConfig.
+//
+// Most options accept both underscore and dash spellings. Unrecognized
+// arguments are ignored. Throws std::runtime_error when the model file or its
+// config file cannot be opened; exits the process for --help, --version, a
+// missing option value, or an invalid --phoneme_silence phoneme.
+void parseArgs(int argc, char *argv[], RunConfig &runConfig) {
+  optional<filesystem::path> modelConfigPath;
+
+  for (int i = 1; i < argc; i++) {
+    std::string arg = argv[i];
+
+    if (arg == "-m" || arg == "--model") {
+      ensureArg(argc, argv, i);
+      runConfig.modelPath = filesystem::path(argv[++i]);
+    } else if (arg == "-c" || arg == "--config") {
+      ensureArg(argc, argv, i);
+      modelConfigPath = filesystem::path(argv[++i]);
+    } else if (arg == "-f" || arg == "--output_file" ||
+               arg == "--output-file") {
+      ensureArg(argc, argv, i);
+      std::string filePath = argv[++i];
+      if (filePath == "-") {
+        // "-" means write the WAV to stdout
+        runConfig.outputType = OUTPUT_STDOUT;
+        runConfig.outputPath = nullopt;
+      } else {
+        runConfig.outputType = OUTPUT_FILE;
+        runConfig.outputPath = filesystem::path(filePath);
+      }
+    } else if (arg == "-d" || arg == "--output_dir" ||
+               arg == "--output-dir") {
+      ensureArg(argc, argv, i);
+      runConfig.outputType = OUTPUT_DIRECTORY;
+      runConfig.outputPath = filesystem::path(argv[++i]);
+    } else if (arg == "--output_raw" || arg == "--output-raw") {
+      runConfig.outputType = OUTPUT_RAW;
+    } else if (arg == "-s" || arg == "--speaker") {
+      ensureArg(argc, argv, i);
+      runConfig.speakerId = (piper::SpeakerId)stol(argv[++i]);
+    } else if (arg == "--noise_scale" || arg == "--noise-scale") {
+      ensureArg(argc, argv, i);
+      runConfig.noiseScale = stof(argv[++i]);
+    } else if (arg == "--length_scale" || arg == "--length-scale") {
+      ensureArg(argc, argv, i);
+      runConfig.lengthScale = stof(argv[++i]);
+    } else if (arg == "--noise_w" || arg == "--noise-w") {
+      ensureArg(argc, argv, i);
+      runConfig.noiseW = stof(argv[++i]);
+    } else if (arg == "--sentence_silence" || arg == "--sentence-silence") {
+      ensureArg(argc, argv, i);
+      runConfig.sentenceSilenceSeconds = stof(argv[++i]);
+    } else if (arg == "--phoneme_silence" || arg == "--phoneme-silence") {
+      // Takes two values: the phoneme and the number of seconds
+      ensureArg(argc, argv, i);
+      ensureArg(argc, argv, i + 1);
+      auto phonemeStr = std::string(argv[++i]);
+      if (!piper::isSingleCodepoint(phonemeStr)) {
+        std::cerr << "Phoneme '" << phonemeStr
+                  << "' is not a single codepoint (--phoneme_silence)"
+                  << std::endl;
+        exit(1);
+      }
+
+      if (!runConfig.phonemeSilenceSeconds) {
+        runConfig.phonemeSilenceSeconds.emplace();
+      }
+
+      auto phoneme = piper::getCodepoint(phonemeStr);
+      (*runConfig.phonemeSilenceSeconds)[phoneme] = stof(argv[++i]);
+    } else if (arg == "--espeak_data" || arg == "--espeak-data") {
+      ensureArg(argc, argv, i);
+      runConfig.eSpeakDataPath = filesystem::path(argv[++i]);
+    } else if (arg == "--tashkeel_model" || arg == "--tashkeel-model") {
+      ensureArg(argc, argv, i);
+      runConfig.tashkeelModelPath = filesystem::path(argv[++i]);
+    } else if (arg == "--json_input" || arg == "--json-input") {
+      runConfig.jsonInput = true;
+    } else if (arg == "--use_cuda" || arg == "--use-cuda") {
+      runConfig.useCuda = true;
+    } else if (arg == "--version") {
+      std::cout << piper::getVersion() << std::endl;
+      exit(0);
+    } else if (arg == "--debug") {
+      // Set DEBUG logging
+      spdlog::set_level(spdlog::level::debug);
+    } else if (arg == "-q" || arg == "--quiet") {
+      // disable logging
+      spdlog::set_level(spdlog::level::off);
+    } else if (arg == "-h" || arg == "--help") {
+      printUsage(argv);
+      exit(0);
+    }
+  }
+
+  // Verify model file exists
+  ifstream modelFile(runConfig.modelPath.c_str(), ios::binary);
+  if (!modelFile.good()) {
+    throw runtime_error("Model file doesn't exist");
+  }
+
+  if (!modelConfigPath) {
+    // Default config path is the model path with ".json" appended
+    runConfig.modelConfigPath =
+        filesystem::path(runConfig.modelPath.string() + ".json");
+  } else {
+    runConfig.modelConfigPath = modelConfigPath.value();
+  }
+
+  // Verify model config exists
+  ifstream modelConfigFile(runConfig.modelConfigPath.c_str());
+  if (!modelConfigFile.good()) {
+    throw runtime_error("Model config doesn't exist");
+  }
+}
--- a/mlu_370-piper/piper/src/cpp/piper.cpp
+++ b/mlu_370-piper/piper/src/cpp/piper.cpp
@@ -0,0 +1,636 @@
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <fstream>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+
+#include <espeak-ng/speak_lib.h>
+#include <onnxruntime_cxx_api.h>
+#include <spdlog/spdlog.h>
+
+#include "json.hpp"
+#include "piper.hpp"
+#include "utf8.h"
+#include "wavfile.hpp"
+
+namespace piper {
+
+// Version string reported by getVersion().
+// When the build defines _PIPER_VERSION, its value is stringified below;
+// otherwise the version is the empty string.
+#ifdef _PIPER_VERSION
+// https://stackoverflow.com/questions/47346133/how-to-use-a-define-inside-a-format-string
+// Two macro levels are needed so that the argument is expanded before "#"
+// turns it into a string literal.
+#define _STR(x) #x
+#define STR(x) _STR(x)
+const std::string VERSION = STR(_PIPER_VERSION);
+#else
+const std::string VERSION = "";
+#endif
+
+// Maximum value for 16-bit signed WAV sample
+// (used as the target peak when float audio is scaled in synthesize)
+const float MAX_WAV_VALUE = 32767.0f;
+
+// Name passed to Ort::Env when the onnxruntime environment is created
+const std::string instanceName{"piper"};
+
+// Return the version string fixed at build time (empty when the build did
+// not define one).
+std::string getVersion() {
+  return VERSION;
+}
+
+// Check whether a UTF-8 string consists of exactly one codepoint
+bool isSingleCodepoint(std::string s) {
+  const auto numCodepoints = utf8::distance(s.begin(), s.end());
+  return numCodepoints == 1;
+}
+
+// Decode and return the first UTF-8 codepoint of a string
+Phoneme getCodepoint(std::string s) {
+  // utf8::next advances the iterator it is given, so decode from a copy
+  auto cursor = s.begin();
+  return utf8::next(cursor, s.end());
+}
+
+// Read the phonemization settings of a voice from its JSON config
+void parsePhonemizeConfig(json &configRoot, PhonemizeConfig &phonemizeConfig) {
+  // Sections read here (each one is only used when present):
+  // {
+  //     "espeak": {
+  //         "voice": "<language code>"
+  //     },
+  //     "phoneme_type": "<espeak or text>",
+  //     "phoneme_map": {
+  //         "<from phoneme>": ["<to phoneme 1>", "<to phoneme 2>", ...]
+  //     },
+  //     "phoneme_id_map": {
+  //         "<phoneme>": [<id1>, <id2>, ...]
+  //     }
+  // }
+
+  // eSpeak voice
+  if (configRoot.contains("espeak")) {
+    auto espeakSection = configRoot["espeak"];
+    if (espeakSection.contains("voice")) {
+      phonemizeConfig.eSpeak.voice = espeakSection["voice"].get<std::string>();
+    }
+  }
+
+  // Only the value "text" changes the phoneme type; any other string leaves
+  // the current setting untouched.
+  if (configRoot.contains("phoneme_type") &&
+      (configRoot["phoneme_type"].get<std::string>() == "text")) {
+    phonemizeConfig.phonemeType = TextPhonemes;
+  }
+
+  // phoneme -> [id, ...]
+  // Every key must be exactly one codepoint.
+  if (configRoot.contains("phoneme_id_map")) {
+    auto idMapSection = configRoot["phoneme_id_map"];
+    for (auto &idMapEntry : idMapSection.items()) {
+      std::string phonemeKey = idMapEntry.key();
+      if (!isSingleCodepoint(phonemeKey)) {
+        // List the ids of the offending entry in the error log
+        std::stringstream idsStream;
+        for (auto &idValue : idMapEntry.value()) {
+          PhonemeId id = idValue.get<PhonemeId>();
+          idsStream << id << ",";
+        }
+
+        spdlog::error("\"{}\" is not a single codepoint (ids={})", phonemeKey,
+                      idsStream.str());
+        throw std::runtime_error(
+            "Phonemes must be one codepoint (phoneme id map)");
+      }
+
+      auto keyCodepoint = getCodepoint(phonemeKey);
+      for (auto &idValue : idMapEntry.value()) {
+        PhonemeId id = idValue.get<PhonemeId>();
+        phonemizeConfig.phonemeIdMap[keyCodepoint].push_back(id);
+      }
+    }
+  }
+
+  // phoneme -> [phoneme, ...]
+  // Both the keys and the mapped values must be exactly one codepoint.
+  if (configRoot.contains("phoneme_map")) {
+    if (!phonemizeConfig.phonemeMap) {
+      phonemizeConfig.phonemeMap.emplace();
+    }
+
+    auto phonemeMapSection = configRoot["phoneme_map"];
+    for (auto &mapEntry : phonemeMapSection.items()) {
+      std::string sourcePhoneme = mapEntry.key();
+      if (!isSingleCodepoint(sourcePhoneme)) {
+        spdlog::error("\"{}\" is not a single codepoint", sourcePhoneme);
+        throw std::runtime_error(
+            "Phonemes must be one codepoint (phoneme map)");
+      }
+
+      auto sourceCodepoint = getCodepoint(sourcePhoneme);
+      for (auto &targetValue : mapEntry.value()) {
+        std::string targetPhoneme = targetValue.get<std::string>();
+        if (!isSingleCodepoint(targetPhoneme)) {
+          throw std::runtime_error(
+              "Phonemes must be one codepoint (phoneme map)");
+        }
+
+        (*phonemizeConfig.phonemeMap)[sourceCodepoint].push_back(
+            getCodepoint(targetPhoneme));
+      }
+    }
+  }
+
+} /* parsePhonemizeConfig */
+
+// Read audio and inference settings for synthesis from a voice's JSON config
+void parseSynthesisConfig(json &configRoot, SynthesisConfig &synthesisConfig) {
+  // Sections read here (each one is only used when present):
+  // {
+  //     "audio": {
+  //         "sample_rate": 22050
+  //     },
+  //     "inference": {
+  //         "noise_scale": 0.667,
+  //         "length_scale": 1,
+  //         "noise_w": 0.8,
+  //         "phoneme_silence": {
+  //           "<phoneme>": <seconds of silence>,
+  //           ...
+  //         }
+  //     }
+  // }
+
+  if (configRoot.contains("audio")) {
+    auto audioSection = configRoot["audio"];
+    if (audioSection.contains("sample_rate")) {
+      // Fallback sample rate is 22050 Hz
+      synthesisConfig.sampleRate = audioSection.value("sample_rate", 22050);
+    }
+  }
+
+  if (!configRoot.contains("inference")) {
+    // Nothing overrides the inference settings
+    return;
+  }
+
+  auto inferenceSection = configRoot["inference"];
+
+  if (inferenceSection.contains("noise_scale")) {
+    synthesisConfig.noiseScale = inferenceSection.value("noise_scale", 0.667f);
+  }
+
+  if (inferenceSection.contains("length_scale")) {
+    synthesisConfig.lengthScale = inferenceSection.value("length_scale", 1.0f);
+  }
+
+  if (inferenceSection.contains("noise_w")) {
+    synthesisConfig.noiseW = inferenceSection.value("noise_w", 0.8f);
+  }
+
+  if (!inferenceSection.contains("phoneme_silence")) {
+    return;
+  }
+
+  // phoneme -> seconds of silence to add after it.
+  // emplace() starts from an empty map, discarding any previous entries.
+  synthesisConfig.phonemeSilenceSeconds.emplace();
+  auto silenceSection = inferenceSection["phoneme_silence"];
+  for (auto &silenceEntry : silenceSection.items()) {
+    std::string phonemeKey = silenceEntry.key();
+    if (!isSingleCodepoint(phonemeKey)) {
+      spdlog::error("\"{}\" is not a single codepoint", phonemeKey);
+      throw std::runtime_error(
+          "Phonemes must be one codepoint (phoneme silence)");
+    }
+
+    auto keyCodepoint = getCodepoint(phonemeKey);
+    (*synthesisConfig.phonemeSilenceSeconds)[keyCodepoint] =
+        silenceEntry.value().get<float>();
+  }
+
+} /* parseSynthesisConfig */
+
+// Read the speaker count and the optional speaker name -> id map from a
+// voice's JSON config
+void parseModelConfig(json &configRoot, ModelConfig &modelConfig) {
+
+  modelConfig.numSpeakers = configRoot["num_speakers"].get<SpeakerId>();
+
+  if (!configRoot.contains("speaker_id_map")) {
+    return;
+  }
+
+  // Create the map on first use; existing entries are kept
+  if (!modelConfig.speakerIdMap) {
+    modelConfig.speakerIdMap.emplace();
+  }
+
+  auto speakerSection = configRoot["speaker_id_map"];
+  for (auto &speakerEntry : speakerSection.items()) {
+    std::string name = speakerEntry.key();
+    (*modelConfig.speakerIdMap)[name] = speakerEntry.value().get<SpeakerId>();
+  }
+
+} /* parseModelConfig */
+
+// Set up the phonemization back ends enabled in the config
+// (eSpeak-ng and/or libtashkeel). Throws std::runtime_error on failure.
+void initialize(PiperConfig &config) {
+  if (config.useESpeak) {
+    // Set up espeak-ng for calling espeak_TextToPhonemesWithTerminator
+    // See: https://github.com/rhasspy/espeak-ng
+    spdlog::debug("Initializing eSpeak");
+    const char *espeakDataPath = config.eSpeakDataPath.c_str();
+    const int initStatus = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
+                                             /*buflength*/ 0,
+                                             /*path*/ espeakDataPath,
+                                             /*options*/ 0);
+    if (initStatus < 0) {
+      throw std::runtime_error("Failed to initialize eSpeak-ng");
+    }
+
+    spdlog::debug("Initialized eSpeak");
+  }
+
+  // Load onnx model for libtashkeel
+  // https://github.com/mush42/libtashkeel/
+  if (config.useTashkeel) {
+    spdlog::debug("Using libtashkeel for diacritization");
+    if (!config.tashkeelModelPath) {
+      throw std::runtime_error("No path to libtashkeel model");
+    }
+
+    const std::string &tashkeelPath = config.tashkeelModelPath.value();
+    spdlog::debug("Loading libtashkeel model from {}", tashkeelPath);
+    config.tashkeelState = std::make_unique<tashkeel::State>();
+    tashkeel::tashkeel_load(tashkeelPath, *config.tashkeelState);
+    spdlog::debug("Initialized libtashkeel");
+  }
+
+  spdlog::info("Initialized piper");
+}
+
+// Shut down eSpeak-ng if this configuration uses it
+void terminate(PiperConfig &config) {
+  const bool espeakInUse = config.useESpeak;
+  if (espeakInUse) {
+    spdlog::debug("Terminating eSpeak");
+    espeak_Terminate();
+    spdlog::debug("Terminated eSpeak");
+  }
+
+  spdlog::info("Terminated piper");
+}
+
+// Create the onnxruntime environment and session for the model at modelPath.
+//
+// modelPath: path to the onnx model file
+// session:   receives the new Ort::Env and Ort::Session; its options object
+//            is configured here before the session is created
+// useCuda:   when true, the CUDA execution provider is appended
+//
+// The time taken to create the session is logged at debug level.
+void loadModel(std::string modelPath, ModelSession &session, bool useCuda) {
+  spdlog::debug("Loading onnx model from {}", modelPath);
+  session.env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING,
+                         instanceName.c_str());
+  session.env.DisableTelemetryEvents();
+
+  if (useCuda) {
+    // Use CUDA provider
+    OrtCUDAProviderOptions cuda_options{};
+    cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic;
+    session.options.AppendExecutionProvider_CUDA(cuda_options);
+  }
+
+  // The commented-out calls below are options that were tried and rejected;
+  // the note above each one records why.
+
+  // Slows down performance by ~2x
+  // session.options.SetIntraOpNumThreads(1);
+
+  // Roughly doubles load time for no visible inference benefit
+  // session.options.SetGraphOptimizationLevel(
+  //     GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+
+  session.options.SetGraphOptimizationLevel(
+      GraphOptimizationLevel::ORT_DISABLE_ALL);
+
+  // Slows down performance very slightly
+  // session.options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
+
+  session.options.DisableCpuMemArena();
+  session.options.DisableMemPattern();
+  session.options.DisableProfiling();
+
+  auto startTime = std::chrono::steady_clock::now();
+
+  // The session is given a wide-character path on Windows and a narrow one
+  // elsewhere.
+#ifdef _WIN32
+  // NOTE(review): this widens each char individually, which presumably is
+  // only correct for ASCII paths (no UTF-8 decoding happens) - confirm.
+  auto modelPathW = std::wstring(modelPath.begin(), modelPath.end());
+  auto modelPathStr = modelPathW.c_str();
+#else
+  auto modelPathStr = modelPath.c_str();
+#endif
+
+  session.onnx = Ort::Session(session.env, modelPathStr, session.options);
+
+  auto endTime = std::chrono::steady_clock::now();
+  spdlog::debug("Loaded onnx model in {} second(s)",
+                std::chrono::duration<double>(endTime - startTime).count());
+}
+
+// Parse a voice's JSON config file, then load its onnx model
+void loadVoice(PiperConfig &config, std::string modelPath,
+               std::string modelConfigPath, Voice &voice,
+               std::optional<SpeakerId> &speakerId, bool useCuda) {
+  spdlog::debug("Parsing voice config at {}", modelConfigPath);
+  std::ifstream configStream(modelConfigPath);
+  voice.configRoot = json::parse(configStream);
+
+  parsePhonemizeConfig(voice.configRoot, voice.phonemizeConfig);
+  parseSynthesisConfig(voice.configRoot, voice.synthesisConfig);
+  parseModelConfig(voice.configRoot, voice.modelConfig);
+
+  // Only multi-speaker models get a speaker id: the requested one, or
+  // speaker 0 when none was requested.
+  const bool isMultiSpeaker = voice.modelConfig.numSpeakers > 1;
+  if (isMultiSpeaker) {
+    voice.synthesisConfig.speakerId = speakerId.value_or(0);
+  }
+
+  spdlog::debug("Voice contains {} speaker(s)", voice.modelConfig.numSpeakers);
+
+  loadModel(modelPath, voice.session, useCuda);
+
+} /* loadVoice */
+
+// Phoneme ids to WAV audio
+//
+// Runs the onnx model once on the given phoneme ids and APPENDS the
+// resulting samples to audioBuffer as 16-bit integers. The float output is
+// scaled so that its peak maps to MAX_WAV_VALUE.
+//
+// phonemeIds:      model input ids (used in place as tensor storage)
+// synthesisConfig: noise/length scales, sample rate and optional speaker id
+// session:         loaded onnx session
+// audioBuffer:     output samples are appended; existing content is kept
+// result:          inferSeconds, audioSeconds and realTimeFactor are
+//                  overwritten with the values for this call
+//
+// Throws std::runtime_error if the model output is not a single,
+// non-scalar tensor.
+void synthesize(std::vector<PhonemeId> &phonemeIds,
+                SynthesisConfig &synthesisConfig, ModelSession &session,
+                std::vector<int16_t> &audioBuffer, SynthesisResult &result) {
+  spdlog::debug("Synthesizing audio for {} phoneme id(s)", phonemeIds.size());
+
+  auto memoryInfo = Ort::MemoryInfo::CreateCpu(
+      OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
+
+  // Backing storage for the input tensors. The tensors below only reference
+  // these vectors, so they must stay alive until inference has finished.
+  std::vector<int64_t> phonemeIdLengths{(int64_t)phonemeIds.size()};
+  std::vector<float> scales{synthesisConfig.noiseScale,
+                            synthesisConfig.lengthScale,
+                            synthesisConfig.noiseW};
+
+  std::vector<Ort::Value> inputTensors;
+  std::vector<int64_t> phonemeIdsShape{1, (int64_t)phonemeIds.size()};
+  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
+      memoryInfo, phonemeIds.data(), phonemeIds.size(), phonemeIdsShape.data(),
+      phonemeIdsShape.size()));
+
+  std::vector<int64_t> phonemeIdLengthsShape{(int64_t)phonemeIdLengths.size()};
+  inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
+      memoryInfo, phonemeIdLengths.data(), phonemeIdLengths.size(),
+      phonemeIdLengthsShape.data(), phonemeIdLengthsShape.size()));
+
+  std::vector<int64_t> scalesShape{(int64_t)scales.size()};
+  inputTensors.push_back(
+      Ort::Value::CreateTensor<float>(memoryInfo, scales.data(), scales.size(),
+                                      scalesShape.data(), scalesShape.size()));
+
+  // Add speaker id.
+  // NOTE: These must be kept outside the "if" below to avoid being deallocated.
+  std::vector<int64_t> speakerId{
+      (int64_t)synthesisConfig.speakerId.value_or(0)};
+  std::vector<int64_t> speakerIdShape{(int64_t)speakerId.size()};
+
+  if (synthesisConfig.speakerId) {
+    inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
+        memoryInfo, speakerId.data(), speakerId.size(), speakerIdShape.data(),
+        speakerIdShape.size()));
+  }
+
+  // From export_onnx.py
+  // Only the first inputTensors.size() names are used, so "sid" is passed
+  // only when a speaker id tensor was added above.
+  std::array<const char *, 4> inputNames = {"input", "input_lengths", "scales",
+                                            "sid"};
+  std::array<const char *, 1> outputNames = {"output"};
+
+  // Infer
+  auto startTime = std::chrono::steady_clock::now();
+  auto outputTensors = session.onnx.Run(
+      Ort::RunOptions{nullptr}, inputNames.data(), inputTensors.data(),
+      inputTensors.size(), outputNames.data(), outputNames.size());
+  auto endTime = std::chrono::steady_clock::now();
+
+  if ((outputTensors.size() != 1) || (!outputTensors.front().IsTensor())) {
+    throw std::runtime_error("Invalid output tensors");
+  }
+  auto inferDuration = std::chrono::duration<double>(endTime - startTime);
+  result.inferSeconds = inferDuration.count();
+
+  const float *audio = outputTensors.front().GetTensorData<float>();
+  auto audioShape =
+      outputTensors.front().GetTensorTypeAndShapeInfo().GetShape();
+  if (audioShape.empty()) {
+    // A scalar output has no dimensions; indexing the last one would read
+    // out of bounds.
+    throw std::runtime_error("Invalid output tensors");
+  }
+
+  // The sample count is taken from the last dimension of the output shape
+  int64_t audioCount = audioShape.back();
+
+  result.audioSeconds = (double)audioCount / (double)synthesisConfig.sampleRate;
+  result.realTimeFactor = 0.0;
+  if (result.audioSeconds > 0) {
+    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
+  }
+  spdlog::debug("Synthesized {} second(s) of audio in {} second(s)",
+                result.audioSeconds, result.inferSeconds);
+
+  // Get max audio value for scaling.
+  // std::abs is used explicitly: an unqualified abs() may resolve to the C
+  // int overload and truncate the float samples.
+  float maxAudioValue = 0.01f;
+  for (int64_t i = 0; i < audioCount; i++) {
+    float audioValue = std::abs(audio[i]);
+    if (audioValue > maxAudioValue) {
+      maxAudioValue = audioValue;
+    }
+  }
+
+  // We know how many samples will be appended. The buffer may already hold
+  // audio from earlier calls, so reserve room for both.
+  if (audioCount > 0) {
+    audioBuffer.reserve(audioBuffer.size() +
+                        static_cast<std::size_t>(audioCount));
+  }
+
+  // Scale audio to fill range and convert to int16
+  float audioScale = (MAX_WAV_VALUE / std::max(0.01f, maxAudioValue));
+  for (int64_t i = 0; i < audioCount; i++) {
+    int16_t intAudioValue = static_cast<int16_t>(
+        std::clamp(audio[i] * audioScale,
+                   static_cast<float>(std::numeric_limits<int16_t>::min()),
+                   static_cast<float>(std::numeric_limits<int16_t>::max())));
+
+    audioBuffer.push_back(intAudioValue);
+  }
+
+  // Clean up
+  for (std::size_t i = 0; i < outputTensors.size(); i++) {
+    Ort::detail::OrtRelease(outputTensors[i].release());
+  }
+
+  for (std::size_t i = 0; i < inputTensors.size(); i++) {
+    Ort::detail::OrtRelease(inputTensors[i].release());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Phonemize text and synthesize audio
+//
+// The text is split into sentences by the phonemizer and each sentence is
+// synthesized on its own. Samples are appended to audioBuffer.
+//
+// config:        supplies the optional libtashkeel state
+// voice:         phonemization/synthesis settings and the onnx session
+// text:          input text (replaced by the diacritized text when
+//                libtashkeel is enabled)
+// audioBuffer:   receives the 16-bit samples
+// result:        audioSeconds and inferSeconds are ACCUMULATED with "+=" and
+//                never reset here, so the caller must pass a zeroed struct;
+//                realTimeFactor is recomputed at the end when audioSeconds
+//                is positive
+// audioCallback: when set, called after every sentence; audioBuffer is
+//                cleared right after the call
+//
+// Throws std::runtime_error when libtashkeel is enabled but not loaded.
+void textToAudio(PiperConfig &config, Voice &voice, std::string text,
+                 std::vector<int16_t> &audioBuffer, SynthesisResult &result,
+                 const std::function<void()> &audioCallback) {
+
+  // Number of zero samples appended after every sentence
+  std::size_t sentenceSilenceSamples = 0;
+  if (voice.synthesisConfig.sentenceSilenceSeconds > 0) {
+    sentenceSilenceSamples = (std::size_t)(
+        voice.synthesisConfig.sentenceSilenceSeconds *
+        voice.synthesisConfig.sampleRate * voice.synthesisConfig.channels);
+  }
+
+  if (config.useTashkeel) {
+    if (!config.tashkeelState) {
+      throw std::runtime_error("Tashkeel model is not loaded");
+    }
+
+    spdlog::debug("Diacritizing text with libtashkeel: {}", text);
+    text = tashkeel::tashkeel_run(text, *config.tashkeelState);
+  }
+
+  // Phonemes for each sentence
+  spdlog::debug("Phonemizing text: {}", text);
+  std::vector<std::vector<Phoneme>> phonemes;
+
+  if (voice.phonemizeConfig.phonemeType == eSpeakPhonemes) {
+    // Use espeak-ng for phonemization
+    eSpeakPhonemeConfig eSpeakConfig;
+    eSpeakConfig.voice = voice.phonemizeConfig.eSpeak.voice;
+    phonemize_eSpeak(text, eSpeakConfig, phonemes);
+  } else {
+    // Use UTF-8 codepoints as "phonemes"
+    CodepointsPhonemeConfig codepointsConfig;
+    phonemize_codepoints(text, codepointsConfig, phonemes);
+  }
+
+  // Synthesize each sentence independently.
+  // phonemeIds is reused between phrases and cleared after each one;
+  // missingPhonemes collects counts over the whole text for the warning at
+  // the end.
+  std::vector<PhonemeId> phonemeIds;
+  std::map<Phoneme, std::size_t> missingPhonemes;
+  for (auto phonemesIter = phonemes.begin(); phonemesIter != phonemes.end();
+       ++phonemesIter) {
+    std::vector<Phoneme> &sentencePhonemes = *phonemesIter;
+
+    if (spdlog::should_log(spdlog::level::debug)) {
+      // DEBUG log for phonemes
+      std::string phonemesStr;
+      for (auto phoneme : sentencePhonemes) {
+        utf8::append(phoneme, std::back_inserter(phonemesStr));
+      }
+
+      spdlog::debug("Converting {} phoneme(s) to ids: {}",
+                    sentencePhonemes.size(), phonemesStr);
+    }
+
+    // Parallel per-phrase vectors: phonemes, synthesis result and the number
+    // of silence samples to append after the phrase.
+    std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
+    std::vector<SynthesisResult> phraseResults;
+    std::vector<size_t> phraseSilenceSamples;
+
+    // Use phoneme/id map from config
+    PhonemeIdConfig idConfig;
+    idConfig.phonemeIdMap =
+        std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);
+
+    if (voice.synthesisConfig.phonemeSilenceSeconds) {
+      // Split into phrases
+      std::map<Phoneme, float> &phonemeSilenceSeconds =
+          *voice.synthesisConfig.phonemeSilenceSeconds;
+
+      auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
+      phrasePhonemes.push_back(currentPhrasePhonemes);
+
+      for (auto sentencePhonemesIter = sentencePhonemes.begin();
+           sentencePhonemesIter != sentencePhonemes.end();
+           sentencePhonemesIter++) {
+        Phoneme &currentPhoneme = *sentencePhonemesIter;
+        // The boundary phoneme itself stays at the end of the phrase it closes
+        currentPhrasePhonemes->push_back(currentPhoneme);
+
+        if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
+          // Split at phrase boundary
+          phraseSilenceSamples.push_back(
+              (std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
+                            voice.synthesisConfig.sampleRate *
+                            voice.synthesisConfig.channels));
+
+          currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
+          phrasePhonemes.push_back(currentPhrasePhonemes);
+        }
+      }
+    } else {
+      // Use all phonemes
+      phrasePhonemes.push_back(
+          std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
+    }
+
+    // Ensure results/samples are the same size
+    // (phrases that did not end on a silence phoneme get 0 silence samples)
+    while (phraseResults.size() < phrasePhonemes.size()) {
+      phraseResults.emplace_back();
+    }
+
+    while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
+      phraseSilenceSamples.push_back(0);
+    }
+
+    // phonemes -> ids -> audio
+    for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
+      // Skip empty phrases (e.g. the one opened after a trailing boundary)
+      if (phrasePhonemes[phraseIdx]->size() <= 0) {
+        continue;
+      }
+
+      // phonemes -> ids
+      phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
+                      missingPhonemes);
+      if (spdlog::should_log(spdlog::level::debug)) {
+        // DEBUG log for phoneme ids
+        std::stringstream phonemeIdsStr;
+        for (auto phonemeId : phonemeIds) {
+          phonemeIdsStr << phonemeId << ", ";
+        }
+
+        spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
+                      phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
+                      phonemeIdsStr.str());
+      }
+
+      // ids -> audio
+      synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
+                 phraseResults[phraseIdx]);
+
+      // Add end of phrase silence
+      for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
+        audioBuffer.push_back(0);
+      }
+
+      // Accumulate onto whatever the caller passed in
+      result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
+      result.inferSeconds += phraseResults[phraseIdx].inferSeconds;
+
+      phonemeIds.clear();
+    }
+
+    // Add end of sentence silence
+    if (sentenceSilenceSamples > 0) {
+      for (std::size_t i = 0; i < sentenceSilenceSamples; i++) {
+        audioBuffer.push_back(0);
+      }
+    }
+
+    if (audioCallback) {
+      // Call back must copy audio since it is cleared afterwards.
+      audioCallback();
+      audioBuffer.clear();
+    }
+
+    phonemeIds.clear();
+  }
+
+  // Report every phoneme that had no entry in the phoneme/id map
+  if (missingPhonemes.size() > 0) {
+    spdlog::warn("Missing {} phoneme(s) from phoneme/id map!",
+                 missingPhonemes.size());
+
+    for (auto phonemeCount : missingPhonemes) {
+      std::string phonemeStr;
+      utf8::append(phonemeCount.first, std::back_inserter(phonemeStr));
+      spdlog::warn("Missing \"{}\" (\\u{:04X}): {} time(s)", phonemeStr,
+                   (uint32_t)phonemeCount.first, phonemeCount.second);
+    }
+  }
+
+  // Real-time factor over the accumulated totals
+  if (result.audioSeconds > 0) {
+    result.realTimeFactor = result.inferSeconds / result.audioSeconds;
+  }
+
+} /* textToAudio */
+
+// Phonemize text and synthesize audio to WAV file
+//
+// Synthesizes the whole text into memory with textToAudio (no per-sentence
+// callback), then writes a WAV header followed by the 16-bit samples to
+// audioFile.
+//
+// audioFile: output stream receiving the header and the raw sample bytes
+// result:    passed through to textToAudio
+void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
+                   std::ostream &audioFile, SynthesisResult &result) {
+
+  std::vector<int16_t> audioBuffer;
+  // nullptr yields an empty std::function, i.e. no callback
+  textToAudio(config, voice, text, audioBuffer, result, nullptr);
+
+  // Write WAV
+  // Bind by const reference: copying the config would also copy its
+  // phoneme silence map for no benefit.
+  const SynthesisConfig &synthesisConfig = voice.synthesisConfig;
+  writeWavHeader(synthesisConfig.sampleRate, synthesisConfig.sampleWidth,
+                 synthesisConfig.channels, (int32_t)audioBuffer.size(),
+                 audioFile);
+
+  audioFile.write(reinterpret_cast<const char *>(audioBuffer.data()),
+                  sizeof(int16_t) * audioBuffer.size());
+
+} /* textToWavFile */
+
+} // namespace piper
--- a/mlu_370-piper/piper/src/cpp/piper.hpp
+++ b/mlu_370-piper/piper/src/cpp/piper.hpp
@@ -0,0 +1,132 @@
+#ifndef PIPER_H_
+#define PIPER_H_
+
+#include <fstream>
+#include <functional>
+#include <map>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include <onnxruntime_cxx_api.h>
+#include <piper-phonemize/phoneme_ids.hpp>
+#include <piper-phonemize/phonemize.hpp>
+#include <piper-phonemize/tashkeel.hpp>
+
+#include "json.hpp"
+
+using json = nlohmann::json;
+
+namespace piper {
+
+// Integer type used for speaker ids
+using SpeakerId = int64_t;
+
+// Settings for phonemization with eSpeak-ng
+struct eSpeakConfig {
+  // Name of the eSpeak voice to phonemize with
+  std::string voice{"en-us"};
+};
+
+// Process-wide settings used by initialize/terminate and the textTo* calls
+struct PiperConfig {
+  // eSpeak-ng: data directory and whether to initialize it at all
+  std::string eSpeakDataPath;
+  bool useESpeak{true};
+
+  // libtashkeel: enable flag, model path and the loaded state
+  bool useTashkeel{false};
+  std::optional<std::string> tashkeelModelPath;
+  std::unique_ptr<tashkeel::State> tashkeelState;
+};
+
+// How text is turned into phonemes: through eSpeak-ng (eSpeakPhonemes) or by
+// using the text's codepoints directly (TextPhonemes)
+enum PhonemeType { eSpeakPhonemes, TextPhonemes };
+
+// Phonemization settings of a voice
+struct PhonemizeConfig {
+  PhonemeType phonemeType{eSpeakPhonemes};
+
+  // Optional phoneme -> phonemes replacement map
+  std::optional<std::map<Phoneme, std::vector<Phoneme>>> phonemeMap;
+
+  // phoneme -> phoneme ids
+  std::map<Phoneme, std::vector<PhonemeId>> phonemeIdMap;
+
+  PhonemeId idPad{0}; // padding (optionally interspersed)
+  PhonemeId idBos{1}; // beginning of sentence
+  PhonemeId idEos{2}; // end of sentence
+  bool interspersePad{true};
+
+  eSpeakConfig eSpeak;
+};
+
+// Settings that control inference and the format of the produced audio
+struct SynthesisConfig {
+  // VITS inference parameters
+  float noiseScale{0.667f};
+  float lengthScale{1.0f};
+  float noiseW{0.8f};
+
+  // Output audio format
+  int sampleRate{22050};
+  int sampleWidth{2}; // 16-bit
+  int channels{1};    // mono
+
+  // Speaker id from 0 to numSpeakers - 1
+  std::optional<SpeakerId> speakerId;
+
+  // Extra silence: after every sentence, and after specific phonemes
+  float sentenceSilenceSeconds{0.2f};
+  std::optional<std::map<piper::Phoneme, float>> phonemeSilenceSeconds;
+};
+
+// Speaker information of a voice model
+struct ModelConfig {
+  // Number of speakers in the model. Initialized so that a
+  // default-constructed ModelConfig never holds an indeterminate value;
+  // it is overwritten when a voice config is parsed.
+  int numSpeakers = 1;
+
+  // speaker name -> id
+  std::optional<std::map<std::string, SpeakerId>> speakerIdMap;
+};
+
+// onnxruntime objects belonging to one loaded model
+struct ModelSession {
+  // NOTE(review): members are destroyed in reverse declaration order, so
+  // env is destroyed before onnx - confirm onnxruntime tolerates this.
+  Ort::Session onnx;
+  Ort::AllocatorWithDefaultOptions allocator;
+  Ort::SessionOptions options;
+  Ort::Env env;
+
+  // Start without a session; one is assigned when a model is loaded
+  ModelSession() : onnx(nullptr) {}
+};
+
+// Timing information for a synthesis run.
+// All members start at zero so that code which accumulates into a freshly
+// declared SynthesisResult (using "+=") never reads indeterminate values.
+struct SynthesisResult {
+  // Seconds spent running inference
+  double inferSeconds = 0.0;
+  // Seconds of audio produced
+  double audioSeconds = 0.0;
+  // inferSeconds / audioSeconds
+  double realTimeFactor = 0.0;
+};
+
+// Everything belonging to one loaded voice: the parsed JSON config, the
+// settings extracted from it, and the onnx session
+struct Voice {
+  // Root of the parsed voice config file
+  json configRoot;
+  PhonemizeConfig phonemizeConfig;
+  SynthesisConfig synthesisConfig;
+  ModelConfig modelConfig;
+  ModelSession session;
+};
+
+// True if the string is a single UTF-8 codepoint
+bool isSingleCodepoint(std::string s);
+
+// Get the first UTF-8 codepoint of a string
+Phoneme getCodepoint(std::string s);
+
+// Get version of Piper (empty string if the build did not define one)
+std::string getVersion();
+
+// Must be called before using textTo* functions.
+// Initializes eSpeak-ng and/or libtashkeel as enabled in config; throws
+// std::runtime_error on failure.
+void initialize(PiperConfig &config);
+
+// Clean up (terminates eSpeak-ng when it is enabled in config)
+void terminate(PiperConfig &config);
+
+// Load Onnx model and JSON config file.
+// speakerId is only applied to multi-speaker voices; speaker 0 is used when
+// it is empty. useCuda selects the CUDA execution provider.
+void loadVoice(PiperConfig &config, std::string modelPath,
+               std::string modelConfigPath, Voice &voice,
+               std::optional<SpeakerId> &speakerId, bool useCuda);
+
+// Phonemize text and synthesize audio.
+// Samples are appended to audioBuffer. When audioCallback is set it is called
+// after each sentence and audioBuffer is cleared afterwards. Timing values
+// are accumulated into result, so pass a zeroed struct.
+void textToAudio(PiperConfig &config, Voice &voice, std::string text,
+                 std::vector<int16_t> &audioBuffer, SynthesisResult &result,
+                 const std::function<void()> &audioCallback);
+
+// Phonemize text and synthesize audio to WAV file
+// (WAV header followed by 16-bit samples, written to audioFile)
+void textToWavFile(PiperConfig &config, Voice &voice, std::string text,
+                   std::ostream &audioFile, SynthesisResult &result);
+
+} // namespace piper
+
+#endif // PIPER_H_
--- a/mlu_370-piper/piper/src/cpp/test.cpp
+++ b/mlu_370-piper/piper/src/cpp/test.cpp
@@ -0,0 +1,60 @@
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "json.hpp"
+#include "piper.hpp"
+
+using namespace std;
+using json = nlohmann::json;
+
+// Smoke test: load a voice, synthesize a fixed sentence to a WAV file and
+// check that a plausible amount of data was written.
+//
+// Usage: <voice model path> <espeak-ng-data path> <output WAV path>
+int main(int argc, char *argv[]) {
+  piper::PiperConfig piperConfig;
+  piper::Voice voice;
+
+  // Required positional arguments, in order, each with the message printed
+  // when it is the first one missing.
+  const char *const missingArgMessages[] = {"Need voice model path",
+                                            "Need espeak-ng-data path",
+                                            "Need output WAV path"};
+  for (int argIdx = 1; argIdx <= 3; ++argIdx) {
+    if (argc <= argIdx) {
+      std::cerr << missingArgMessages[argIdx - 1] << std::endl;
+      return 1;
+    }
+  }
+
+  const std::string modelPath(argv[1]);
+  piperConfig.eSpeakDataPath = std::string(argv[2]);
+  const std::string outputPath(argv[3]);
+
+  // No speaker requested; the voice config is expected next to the model
+  optional<piper::SpeakerId> speakerId;
+  const std::string modelConfigPath = modelPath + ".json";
+  loadVoice(piperConfig, modelPath, modelConfigPath, voice, speakerId, false);
+  piper::initialize(piperConfig);
+
+  // Output audio to WAV file
+  ofstream audioFile(outputPath, ios::binary);
+
+  piper::SynthesisResult result;
+  piper::textToWavFile(piperConfig, voice, "This is a test.", audioFile,
+                       result);
+  piper::terminate(piperConfig);
+
+  // Verify that file has some data
+  const bool outputTooSmall = audioFile.tellp() < 10000;
+  if (outputTooSmall) {
+    std::cerr << "ERROR: Output file is smaller than expected!" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "OK" << std::endl;
+
+  return EXIT_SUCCESS;
+}
--- a/mlu_370-piper/piper/src/cpp/utf8.h
+++ b/mlu_370-piper/piper/src/cpp/utf8.h
@@ -0,0 +1,34 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "utf8/checked.h"
+#include "utf8/unchecked.h"
+
+#endif // header guard
--- a/mlu_370-piper/piper/src/cpp/utf8/checked.h
+++ b/mlu_370-piper/piper/src/cpp/utf8/checked.h
@@ -0,0 +1,335 @@
+// Copyright 2006-2016 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+#include <stdexcept>
+
+namespace utf8
+{
+    // Root of the library's exception hierarchy. It adds nothing to
+    // std::exception; it only gives callers a single type to catch.
+    class exception : public ::std::exception
+    {
+    };
+
+    // Exceptions that may be thrown from the library functions.
+    // Carries a rejected code point value; what() returns a fixed message.
+    class invalid_code_point : public exception {
+    public:
+        // Store the code point that was rejected.
+        invalid_code_point(uint32_t codepoint)
+            : cp(codepoint)
+        {
+        }
+
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE
+        {
+            return "Invalid code point";
+        }
+
+        // The value passed to the constructor.
+        uint32_t code_point() const
+        {
+            return cp;
+        }
+
+    private:
+        uint32_t cp;
+    };
+
+    // Carries one offending octet; what() returns a fixed message.
+    class invalid_utf8 : public exception {
+    public:
+        // Store the octet that was rejected.
+        invalid_utf8 (uint8_t u)
+            : u8(u)
+        {
+        }
+
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE
+        {
+            return "Invalid UTF-8";
+        }
+
+        // The value passed to the constructor.
+        uint8_t utf8_octet() const
+        {
+            return u8;
+        }
+
+    private:
+        uint8_t u8;
+    };
+
+    // Carries one offending 16-bit code unit; what() returns a fixed message.
+    class invalid_utf16 : public exception {
+    public:
+        // Store the code unit that was rejected.
+        invalid_utf16 (uint16_t u)
+            : u16(u)
+        {
+        }
+
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE
+        {
+            return "Invalid UTF-16";
+        }
+
+        // The value passed to the constructor.
+        uint16_t utf16_word() const
+        {
+            return u16;
+        }
+
+    private:
+        uint16_t u16;
+    };
+
+    // Signals that a range ended before an operation could complete.
+    // It carries no payload; what() returns a fixed message.
+    class not_enough_room : public exception {
+    public:
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE
+        {
+            return "Not enough space";
+        }
+    };
+
+    /// The library API - functions intended to be called by the users
+
+    // Encode the code point cp as UTF-8 and write the octets through result.
+    // Returns the iterator positioned after the last octet written.
+    // Throws invalid_code_point when cp fails is_code_point_valid.
+    template <typename octet_iterator>
+    octet_iterator append(uint32_t cp, octet_iterator result)
+    {
+        if (!utf8::internal::is_code_point_valid(cp))
+            throw invalid_code_point(cp);
+
+        if (cp < 0x80) {
+            // one octet: 0xxxxxxx
+            *(result++) = static_cast<uint8_t>(cp);
+            return result;
+        }
+
+        if (cp < 0x800) {
+            // two octets: 110xxxxx 10xxxxxx
+            *(result++) = static_cast<uint8_t>(0xc0 | (cp >> 6));
+            *(result++) = static_cast<uint8_t>(0x80 | (cp & 0x3f));
+            return result;
+        }
+
+        if (cp < 0x10000) {
+            // three octets: 1110xxxx 10xxxxxx 10xxxxxx
+            *(result++) = static_cast<uint8_t>(0xe0 | (cp >> 12));
+            *(result++) = static_cast<uint8_t>(0x80 | ((cp >> 6) & 0x3f));
+            *(result++) = static_cast<uint8_t>(0x80 | (cp & 0x3f));
+            return result;
+        }
+
+        // four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        *(result++) = static_cast<uint8_t>(0xf0 | (cp >> 18));
+        *(result++) = static_cast<uint8_t>(0x80 | ((cp >> 12) & 0x3f));
+        *(result++) = static_cast<uint8_t>(0x80 | ((cp >> 6) & 0x3f));
+        *(result++) = static_cast<uint8_t>(0x80 | (cp & 0x3f));
+        return result;
+    }
+
+    // Copy [start, end) to out, emitting the UTF-8 encoding of replacement
+    // in place of every sequence that fails validation. Valid sequences are
+    // copied octet for octet. Returns the advanced output iterator.
+    template <typename octet_iterator, typename output_iterator>
+    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+    {
+        while (start != end) {
+            const octet_iterator sequence_start = start;
+            const internal::utf_error status = utf8::internal::validate_next(start, end);
+
+            if (status == internal::UTF8_OK) {
+                // validate_next moved start past the sequence; copy it verbatim
+                for (octet_iterator cursor = sequence_start; cursor != start; ++cursor)
+                    *out++ = *cursor;
+                continue;
+            }
+
+            // Every failure kind produces exactly one replacement mark
+            out = utf8::append (replacement, out);
+
+            if (status == internal::NOT_ENOUGH_ROOM) {
+                // truncated tail: nothing more to read
+                start = end;
+                continue;
+            }
+
+            // validate_next restored start on failure; step over the bad lead
+            ++start;
+
+            if (status != internal::INVALID_LEAD) {
+                // INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT:
+                // just one replacement mark for the sequence
+                while (start != end && utf8::internal::is_trail(*start))
+                    ++start;
+            }
+        }
+        return out;
+    }
+
+    // Overload of replace_invalid that substitutes code point 0xfffd for
+    // each invalid sequence.
+    template <typename octet_iterator, typename output_iterator>
+    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+    {
+        static const uint32_t default_marker = utf8::internal::mask16(0xfffd);
+        out = utf8::replace_invalid(start, end, out, default_marker);
+        return out;
+    }
+
+    // Decode the sequence starting at it and return its code point. On
+    // success it is advanced past the sequence; validate_next leaves it
+    // unchanged on failure.
+    // Throws not_enough_room, invalid_utf8 or invalid_code_point according
+    // to the error reported by validate_next.
+    template <typename octet_iterator>
+    uint32_t next(octet_iterator& it, octet_iterator end)
+    {
+        uint32_t decoded = 0;
+        const internal::utf_error status = utf8::internal::validate_next(it, end, decoded);
+
+        if (status == internal::NOT_ENOUGH_ROOM)
+            throw not_enough_room();
+
+        if (status == internal::INVALID_LEAD ||
+            status == internal::INCOMPLETE_SEQUENCE ||
+            status == internal::OVERLONG_SEQUENCE)
+            throw invalid_utf8(*it);
+
+        if (status == internal::INVALID_CODE_POINT)
+            throw invalid_code_point(decoded);
+
+        return decoded;
+    }
+
+    // Like utf8::next, but it is taken by value, so the caller's iterator
+    // is not advanced.
+    template <typename octet_iterator>
+    uint32_t peek_next(octet_iterator it, octet_iterator end)
+    {
+        const uint32_t decoded = utf8::next(it, end);
+        return decoded;
+    }
+
+    // Move it back to the beginning of the preceding sequence and return
+    // the code point decoded there. start is the lowest position that may
+    // be examined.
+    // Throws not_enough_room when it == start, and invalid_utf8 when start
+    // is reached while only trail octets have been seen.
+    template <typename octet_iterator>
+    uint32_t prior(octet_iterator& it, octet_iterator start)
+    {
+        // can't do much if it == start
+        if (it == start)
+            throw not_enough_room();
+
+        const octet_iterator sequence_end = it;
+
+        // Step backwards over trail octets until a non-trail octet is found
+        for (;;) {
+            --it;
+            if (!utf8::internal::is_trail(*it))
+                break;
+            if (it == start)
+                throw invalid_utf8(*it); // error - no lead byte in the sequence
+        }
+
+        return utf8::peek_next(it, sequence_end);
+    }
+
+    // Move it by n code points. For n >= 0 it steps forward with utf8::next
+    // and end is the upper bound; for n < 0 it steps backward with
+    // utf8::prior and end is handed to prior as the lower bound.
+    // Exceptions thrown by next/prior propagate to the caller.
+    template <typename octet_iterator, typename distance_type>
+    void advance (octet_iterator& it, distance_type n, octet_iterator end)
+    {
+        const distance_type zero(0);
+
+        if (n < zero) {
+            // backward
+            distance_type remaining = n;
+            while (remaining < zero) {
+                utf8::prior(it, end);
+                ++remaining;
+            }
+            return;
+        }
+
+        // forward
+        distance_type done = zero;
+        while (done < n) {
+            utf8::next(it, end);
+            ++done;
+        }
+    }
+
+    // Count the code points in [first, last) by decoding each one with
+    // utf8::next; exceptions from next propagate to the caller.
+    template <typename octet_iterator>
+    typename std::iterator_traits<octet_iterator>::difference_type
+    distance (octet_iterator first, octet_iterator last)
+    {
+        typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;
+
+        count_type code_points = 0;
+        while (first < last) {
+            utf8::next(first, last);
+            ++code_points;
+        }
+        return code_points;
+    }
+
+    // Convert the UTF-16 code units in [start, end) to UTF-8 written through
+    // result; returns the advanced output iterator.
+    // Throws invalid_utf16 for a lead surrogate that is not followed by a
+    // trail surrogate, and for a trail surrogate with no preceding lead.
+    template <typename u16bit_iterator, typename octet_iterator>
+    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+    {
+        while (start != end) {
+            uint32_t cp = utf8::internal::mask16(*start++);
+
+            if (utf8::internal::is_lead_surrogate(cp)) {
+                // A lead surrogate must be followed by a trail surrogate
+                if (start == end)
+                    throw invalid_utf16(static_cast<uint16_t>(cp));
+
+                const uint32_t trail = utf8::internal::mask16(*start++);
+                if (!utf8::internal::is_trail_surrogate(trail))
+                    throw invalid_utf16(static_cast<uint16_t>(trail));
+
+                // Combine the pair into one code point
+                cp = (cp << 10) + trail + internal::SURROGATE_OFFSET;
+            }
+            else if (utf8::internal::is_trail_surrogate(cp)) {
+                // Lone trail surrogate
+                throw invalid_utf16(static_cast<uint16_t>(cp));
+            }
+
+            result = utf8::append(cp, result);
+        }
+        return result;
+    }
+
+    // Convert the UTF-8 octets in [start, end) to UTF-16 code units written
+    // through result; returns the advanced output iterator. Code points
+    // above 0xffff are written as a surrogate pair. Exceptions from
+    // utf8::next propagate to the caller.
+    template <typename u16bit_iterator, typename octet_iterator>
+    u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+    {
+        while (start < end) {
+            const uint32_t cp = utf8::next(start, end);
+
+            if (cp <= 0xffff) {
+                // fits in a single code unit
+                *result++ = static_cast<uint16_t>(cp);
+                continue;
+            }
+
+            // make a surrogate pair
+            *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
+            *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+        }
+        return result;
+    }
+
+    // Encode every code point in [start, end) as UTF-8 through result and
+    // return the advanced output iterator. utf8::append throws
+    // invalid_code_point for values it rejects.
+    template <typename octet_iterator, typename u32bit_iterator>
+    octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+    {
+        while (start != end) {
+            const uint32_t cp = *(start++);
+            result = utf8::append(cp, result);
+        }
+
+        return result;
+    }
+
+    // Decode every sequence in [start, end) with utf8::next and write the
+    // code points through result; returns the advanced output iterator.
+    // Exceptions from utf8::next propagate to the caller.
+    template <typename octet_iterator, typename u32bit_iterator>
+    u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+    {
+        while (start < end) {
+            const uint32_t cp = utf8::next(start, end);
+            (*result++) = cp;
+        }
+
+        return result;
+    }
+
+    // The iterator class
+    //
+    // Adapts a range of UTF-8 octets so that each step yields one decoded
+    // code point. Dereference and increment go through utf8::next, decrement
+    // through utf8::prior, so malformed input is reported with the
+    // exceptions those functions throw.
+    template <typename octet_iterator>
+    class iterator {
+      octet_iterator it;           // current position in the octet range
+      octet_iterator range_start;  // lower bound handed to utf8::prior
+      octet_iterator range_end;    // upper bound handed to utf8::next
+      public:
+      typedef uint32_t value_type;
+      typedef uint32_t* pointer;
+      typedef uint32_t& reference;
+      typedef std::ptrdiff_t difference_type;
+      typedef std::bidirectional_iterator_tag iterator_category;
+      // Value-initialise the members: for raw-pointer octet iterators this
+      // yields null pointers instead of indeterminate values, which
+      // operator== would otherwise read when comparing default-constructed
+      // iterators.
+      iterator () : it(), range_start(), range_end() {}
+      // Bind to [rangestart, rangeend] with octet_it as the current position.
+      // Throws std::out_of_range when octet_it lies outside that range.
+      explicit iterator (const octet_iterator& octet_it,
+                         const octet_iterator& rangestart,
+                         const octet_iterator& rangeend) :
+               it(octet_it), range_start(rangestart), range_end(rangeend)
+      {
+          if (it < range_start || it > range_end)
+              throw std::out_of_range("Invalid utf-8 iterator position");
+      }
+      // the default "big three" are OK
+      // Underlying octet iterator at the current position.
+      octet_iterator base () const { return it; }
+      // Decode the code point at the current position without moving.
+      uint32_t operator * () const
+      {
+          octet_iterator temp = it;
+          return utf8::next(temp, range_end);
+      }
+      // Equal when both refer to the same position. Comparing iterators
+      // bound to different ranges throws std::logic_error.
+      bool operator == (const iterator& rhs) const
+      {
+          if (range_start != rhs.range_start || range_end != rhs.range_end)
+              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
+          return (it == rhs.it);
+      }
+      bool operator != (const iterator& rhs) const
+      {
+          return !(operator == (rhs));
+      }
+      // Pre-increment: step forward one code point.
+      iterator& operator ++ ()
+      {
+          utf8::next(it, range_end);
+          return *this;
+      }
+      // Post-increment: step forward, return the previous position.
+      iterator operator ++ (int)
+      {
+          iterator temp = *this;
+          utf8::next(it, range_end);
+          return temp;
+      }
+      // Pre-decrement: step back one code point.
+      iterator& operator -- ()
+      {
+          utf8::prior(it, range_start);
+          return *this;
+      }
+      // Post-decrement: step back, return the previous position.
+      iterator operator -- (int)
+      {
+          iterator temp = *this;
+          utf8::prior(it, range_start);
+          return temp;
+      }
+    }; // class iterator
+
+} // namespace utf8
+
+#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
+#include "cpp17.h"
+#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
+#include "cpp11.h"
+#endif // C++ 11 or later
+
+#endif //header guard
+
--- a/mlu_370-piper/piper/src/cpp/utf8/core.h
+++ b/mlu_370-piper/piper/src/cpp/utf8/core.h
@@ -0,0 +1,338 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include <iterator>
+
+// Determine the C++ standard version.
+// If the user defines UTF_CPP_CPLUSPLUS, use that.
+// Otherwise, trust the unreliable predefined macro __cplusplus
+
+#if !defined UTF_CPP_CPLUSPLUS
+    #define UTF_CPP_CPLUSPLUS __cplusplus
+#endif
+
+#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
+    #define UTF_CPP_OVERRIDE override
+    #define UTF_CPP_NOEXCEPT noexcept
+#else // C++ 98/03
+    #define UTF_CPP_OVERRIDE
+    #define UTF_CPP_NOEXCEPT throw()
+#endif // C++ 11 or later
+
+
+namespace utf8
+{
+    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
+    // You may need to change them to match your system.
+    // These typedefs have the same names as ones from cstdint, or boost/cstdint
+    // NOTE(review): they are declared inside namespace utf8 and assume that
+    // unsigned char/short/int are 8/16/32 bits wide - confirm for the target ABI.
+    typedef unsigned char   uint8_t;
+    typedef unsigned short  uint16_t;
+    typedef unsigned int    uint32_t;
+
+// Helper code - not intended to be directly called by the library users. May be changed at any time
+namespace internal
+{
+    // Unicode constants
+    // Leading (high) surrogates: 0xd800 - 0xdbff
+    // Trailing (low) surrogates: 0xdc00 - 0xdfff
+    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
+    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
+    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+    // Added to (code point >> 10) to obtain the lead surrogate of a pair.
+    const uint16_t LEAD_OFFSET         = 0xd7c0u;       // LEAD_SURROGATE_MIN - (0x10000 >> 10)
+    // Added to (lead << 10) + trail to obtain the code point of a pair.
+    // The formula in the trailing comment is negative; the constant is its
+    // value modulo 2^32, so the addition relies on unsigned wrap-around.
+    const uint32_t SURROGATE_OFFSET    = 0xfca02400u;   // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
+
+    // Maximum valid value for a Unicode code point
+    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
+
+    // Keep only the low 8 bits of oc and return them as an unsigned octet.
+    template<typename octet_type>
+    inline uint8_t mask8(octet_type oc)
+    {
+        return static_cast<uint8_t>(oc & 0xff);
+    }
+    // Keep only the low 16 bits of oc and return them as an unsigned value.
+    template<typename u16_type>
+    inline uint16_t mask16(u16_type oc)
+    {
+        return static_cast<uint16_t>(oc & 0xffff);
+    }
+    // True when the low 8 bits of oc have the form 10xxxxxx.
+    template<typename octet_type>
+    inline bool is_trail(octet_type oc)
+    {
+        return (utf8::internal::mask8(oc) & 0xc0) == 0x80;
+    }
+
+    // True when cp lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
+    template <typename u16>
+    inline bool is_lead_surrogate(u16 cp)
+    {
+        return !(cp < LEAD_SURROGATE_MIN || cp > LEAD_SURROGATE_MAX);
+    }
+
+    // True when cp lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
+    template <typename u16>
+    inline bool is_trail_surrogate(u16 cp)
+    {
+        return !(cp < TRAIL_SURROGATE_MIN || cp > TRAIL_SURROGATE_MAX);
+    }
+
+    // True when cp lies anywhere in the surrogate range
+    // [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
+    template <typename u16>
+    inline bool is_surrogate(u16 cp)
+    {
+        return !(cp < LEAD_SURROGATE_MIN || cp > TRAIL_SURROGATE_MAX);
+    }
+
+    // A code point is valid when it does not exceed CODE_POINT_MAX and is
+    // not a surrogate.
+    template <typename u32>
+    inline bool is_code_point_valid(u32 cp)
+    {
+        if (cp > CODE_POINT_MAX)
+            return false;
+        return !utf8::internal::is_surrogate(cp);
+    }
+
+    // Number of octets announced by the lead octet at lead_it:
+    // 1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx, 4 for 11110xxx,
+    // and 0 when the octet matches none of these patterns.
+    template <typename octet_iterator>
+    inline typename std::iterator_traits<octet_iterator>::difference_type
+    sequence_length(octet_iterator lead_it)
+    {
+        const uint8_t lead = utf8::internal::mask8(*lead_it);
+
+        if (lead < 0x80)
+            return 1;
+        if ((lead & 0xe0) == 0xc0)
+            return 2;
+        if ((lead & 0xf0) == 0xe0)
+            return 3;
+        if ((lead & 0xf8) == 0xf0)
+            return 4;
+
+        return 0;
+    }
+
+    // True when cp was decoded from a sequence whose length differs from
+    // the length required for its magnitude (1 octet below 0x80, 2 below
+    // 0x800, 3 below 0x10000). Larger values are never reported as overlong.
+    template <typename octet_difference_type>
+    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
+    {
+        if (cp < 0x80)
+            return length != 1;
+        if (cp < 0x800)
+            return length != 2;
+        if (cp < 0x10000)
+            return length != 3;
+
+        return false;
+    }
+
+    // Result codes returned by the decoding/validation helpers in this file.
+    //   UTF8_OK             - the sequence decoded and passed every check
+    //   NOT_ENOUGH_ROOM     - the range ended before the sequence was complete
+    //   INVALID_LEAD        - sequence_length() reported 0 for the first octet
+    //   INCOMPLETE_SEQUENCE - an octet that should be a trail octet is not one
+    //   OVERLONG_SEQUENCE   - is_overlong_sequence() rejected the decoded value
+    //   INVALID_CODE_POINT  - is_code_point_valid() rejected the decoded value
+    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
+
+    /// Helper for get_sequence_x
+    /// Advances it by one octet, then reports NOT_ENOUGH_ROOM if that
+    /// reached end, INCOMPLETE_SEQUENCE if the new octet is not a trail
+    /// octet, and UTF8_OK otherwise.
+    template <typename octet_iterator>
+    utf_error increase_safely(octet_iterator& it, octet_iterator end)
+    {
+        ++it;
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        return utf8::internal::is_trail(*it) ? UTF8_OK : INCOMPLETE_SEQUENCE;
+    }
+
+    // Calls increase_safely(IT, END) and, when the result is not UTF8_OK,
+    // returns that result from the *enclosing* function. Expands to a
+    // braced block, so call sites are written without a trailing semicolon.
+    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}    
+
+    /// get_sequence_x functions decode utf-8 sequences of the length x
+    // Decode a one-octet sequence: the code point is the octet at it.
+    // it is not advanced. Returns NOT_ENOUGH_ROOM when it == end.
+    template <typename octet_iterator>
+    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it != end) {
+            code_point = utf8::internal::mask8(*it);
+            return UTF8_OK;
+        }
+
+        return NOT_ENOUGH_ROOM;
+    }
+
+    // Decode a two-octet sequence starting at it into code_point.
+    // On success it is left on the second (last) octet of the sequence.
+    // On failure the error from increase_safely is returned; it may already
+    // have been advanced and code_point holds a partial value.
+    template <typename octet_iterator>
+    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it == end) 
+            return NOT_ENOUGH_ROOM;
+
+        // Start with the raw lead octet
+        code_point = utf8::internal::mask8(*it);
+
+        // Move to the trail octet; returns from this function on error
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        // Low 5 bits of the lead become bits 6-10, low 6 bits of the trail bits 0-5
+        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
+
+        return UTF8_OK;
+    }
+
+    // Decode a three-octet sequence starting at it into code_point.
+    // On success it is left on the third (last) octet of the sequence.
+    // On failure the error from increase_safely is returned; it may already
+    // have been advanced and code_point holds a partial value.
+    template <typename octet_iterator>
+    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+            
+        // Start with the raw lead octet
+        code_point = utf8::internal::mask8(*it);
+
+        // Move to the first trail octet; returns from this function on error
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        // Low 4 bits of the lead become bits 12-15, low 6 bits of this trail bits 6-11
+        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
+
+        // Move to the second trail octet; returns from this function on error
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        // Low 6 bits of the last trail fill bits 0-5
+        code_point += (*it) & 0x3f;
+
+        return UTF8_OK;
+    }
+
+    // Decode a four-octet sequence starting at it into code_point.
+    // On success it is left on the fourth (last) octet of the sequence.
+    // On failure the error from increase_safely is returned; it may already
+    // have been advanced and code_point holds a partial value.
+    template <typename octet_iterator>
+    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it == end)
+           return NOT_ENOUGH_ROOM;
+
+        // Start with the raw lead octet
+        code_point = utf8::internal::mask8(*it);
+
+        // Move to the first trail octet; returns from this function on error
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        // Low 3 bits of the lead become bits 18-20, low 6 bits of this trail bits 12-17
+        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
+
+        // Move to the second trail octet; returns from this function on error
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        // Low 6 bits of this trail fill bits 6-11
+        code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
+
+        // Move to the third trail octet; returns from this function on error
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        // Low 6 bits of the last trail fill bits 0-5
+        code_point += (*it) & 0x3f;
+
+        return UTF8_OK;
+    }
+
+    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
+
+    // Decode and validate the sequence starting at it.
+    //
+    // On success returns UTF8_OK, stores the code point in code_point and
+    // advances it past the sequence. On failure returns the error code and
+    // restores it to its value on entry. code_point is written only on
+    // success and on INVALID_CODE_POINT (where it receives the rejected
+    // value, so that callers can report it); it is left untouched for every
+    // other error.
+    template <typename octet_iterator>
+    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        // Save the original value of it so we can go back in case of failure
+        // Of course, it does not make much sense with e.g. stream iterators
+        octet_iterator original_it = it;
+
+        uint32_t cp = 0;
+        // Determine the sequence length based on the lead octet
+        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
+        const octet_difference_type length = utf8::internal::sequence_length(it);
+
+        // Get trail octets and calculate the code point
+        utf_error err = UTF8_OK;
+        switch (length) {
+            case 0:
+                return INVALID_LEAD;
+            case 1:
+                err = utf8::internal::get_sequence_1(it, end, cp);
+                break;
+            case 2:
+                err = utf8::internal::get_sequence_2(it, end, cp);
+                break;
+            case 3:
+                err = utf8::internal::get_sequence_3(it, end, cp);
+                break;
+            case 4:
+                err = utf8::internal::get_sequence_4(it, end, cp);
+                break;
+        }
+
+        if (err == UTF8_OK) {
+            // Decoding succeeded. Now, security checks...
+            if (utf8::internal::is_code_point_valid(cp)) {
+                if (!utf8::internal::is_overlong_sequence(cp, length)){
+                    // Passed! Return here.
+                    code_point = cp;
+                    // get_sequence_x leaves it on the last octet; step past it
+                    ++it;
+                    return UTF8_OK;
+                }
+                else
+                    err = OVERLONG_SEQUENCE;
+            }
+            else {
+                // Publish the rejected value: utf8::next passes code_point to
+                // invalid_code_point, which would otherwise always carry 0.
+                code_point = cp;
+                err = INVALID_CODE_POINT;
+            }
+        }
+
+        // Failure branch - restore the original value of the iterator
+        it = original_it;
+        return err;
+    }
+
+    template <typename octet_iterator>
+    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
+        uint32_t ignored;
+        return utf8::internal::validate_next(it, end, ignored);
+    }
+
+} // namespace internal
+
+    /// The library API - functions intended to be called by the users
+
+    // Byte order mark: the three octets EF BB BF, i.e. the UTF-8 encoding
+    // of U+FEFF. Compared against by starts_with_bom().
+    const uint8_t bom[] = {0xef, 0xbb, 0xbf};
+
+    // Return an iterator to the start of the first sequence in [start, end)
+    // that fails validation, or end when every sequence is valid.
+    template <typename octet_iterator>
+    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
+    {
+        octet_iterator cursor = start;
+
+        // validate_next advances cursor on success and restores it on
+        // failure, so on exit cursor is either end or the offending position
+        while (cursor != end &&
+               utf8::internal::validate_next(cursor, end) == internal::UTF8_OK) {
+        }
+
+        return cursor;
+    }
+
+    // True when find_invalid finds no invalid sequence in [start, end).
+    template <typename octet_iterator>
+    inline bool is_valid(octet_iterator start, octet_iterator end)
+    {
+        const octet_iterator first_bad = utf8::find_invalid(start, end);
+        return first_bad == end;
+    }
+
+    // True when [it, end) holds at least three octets and the first three
+    // equal the byte order mark stored in bom.
+    template <typename octet_iterator>
+    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
+    {
+        if (it == end)
+            return false;
+        if (utf8::internal::mask8(*it++) != bom[0])
+            return false;
+
+        if (it == end)
+            return false;
+        if (utf8::internal::mask8(*it++) != bom[1])
+            return false;
+
+        if (it == end)
+            return false;
+        return utf8::internal::mask8(*it) == bom[2];
+    }
+} // namespace utf8
+
+#endif // header guard
+
+
--- a/mlu_370-piper/piper/src/cpp/utf8/cpp11.h
+++ b/mlu_370-piper/piper/src/cpp/utf8/cpp11.h
@@ -0,0 +1,103 @@
+// Copyright 2018 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
+#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
+
+#include "checked.h"
+#include <string>
+
+namespace utf8
+{
+    // std::string convenience overloads. Each one forwards to the
+    // iterator-based function of the same name, so errors are reported
+    // exactly as that function reports them.
+
+    // Append the UTF-8 encoding of cp to s.
+    inline void append(char32_t cp, std::string& s)
+    {
+        append(static_cast<uint32_t>(cp), std::back_inserter(s));
+    }
+
+    // Return s converted from UTF-16 to UTF-8.
+    inline std::string utf16to8(const std::u16string& s)
+    {
+        std::string utf8_text;
+        utf16to8(s.begin(), s.end(), std::back_inserter(utf8_text));
+        return utf8_text;
+    }
+
+    // Return s converted from UTF-8 to UTF-16.
+    inline std::u16string utf8to16(const std::string& s)
+    {
+        std::u16string utf16_text;
+        utf8to16(s.begin(), s.end(), std::back_inserter(utf16_text));
+        return utf16_text;
+    }
+
+    // Return s converted from UTF-32 to UTF-8.
+    inline std::string utf32to8(const std::u32string& s)
+    {
+        std::string utf8_text;
+        utf32to8(s.begin(), s.end(), std::back_inserter(utf8_text));
+        return utf8_text;
+    }
+
+    // Return s converted from UTF-8 to UTF-32.
+    inline std::u32string utf8to32(const std::string& s)
+    {
+        std::u32string utf32_text;
+        utf8to32(s.begin(), s.end(), std::back_inserter(utf32_text));
+        return utf32_text;
+    }
+
+    // Offset of the first invalid sequence in s, or std::string::npos when
+    // the whole string validates.
+    inline std::size_t find_invalid(const std::string& s)
+    {
+        const std::string::const_iterator first_bad = find_invalid(s.begin(), s.end());
+        if (first_bad == s.end())
+            return std::string::npos;
+        return first_bad - s.begin();
+    }
+
+    // True when s contains no invalid sequence.
+    inline bool is_valid(const std::string& s)
+    {
+        const bool all_valid = is_valid(s.begin(), s.end());
+        return all_valid;
+    }
+
+    // Copy of s with every invalid sequence replaced by replacement.
+    inline std::string replace_invalid(const std::string& s, char32_t replacement)
+    {
+        std::string repaired;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired), replacement);
+        return repaired;
+    }
+
+    // Copy of s with every invalid sequence replaced by the default marker
+    // of the iterator-based overload.
+    inline std::string replace_invalid(const std::string& s)
+    {
+        std::string repaired;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired));
+        return repaired;
+    }
+
+    // True when s begins with the byte order mark.
+    inline bool starts_with_bom(const std::string& s)
+    {
+        const bool has_bom = starts_with_bom(s.begin(), s.end());
+        return has_bom;
+    }
+
+} // namespace utf8
+
+#endif // header guard
+
--- a/mlu_370-piper/piper/src/cpp/utf8/cpp17.h
+++ b/mlu_370-piper/piper/src/cpp/utf8/cpp17.h
@@ -0,0 +1,103 @@
+// Copyright 2018 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
+#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
+
+#include "checked.h"
+#include <string>
+
+namespace utf8
+{
+    // std::string_view convenience overloads. Each one forwards to the
+    // iterator-based function of the same name, so errors are reported
+    // exactly as that function reports them.
+
+    // Append the UTF-8 encoding of cp to s.
+    inline void append(char32_t cp, std::string& s)
+    {
+        append(static_cast<uint32_t>(cp), std::back_inserter(s));
+    }
+
+    // Return s converted from UTF-16 to UTF-8.
+    inline std::string utf16to8(std::u16string_view s)
+    {
+        std::string utf8_text;
+        utf16to8(s.begin(), s.end(), std::back_inserter(utf8_text));
+        return utf8_text;
+    }
+
+    // Return s converted from UTF-8 to UTF-16.
+    inline std::u16string utf8to16(std::string_view s)
+    {
+        std::u16string utf16_text;
+        utf8to16(s.begin(), s.end(), std::back_inserter(utf16_text));
+        return utf16_text;
+    }
+
+    // Return s converted from UTF-32 to UTF-8.
+    inline std::string utf32to8(std::u32string_view s)
+    {
+        std::string utf8_text;
+        utf32to8(s.begin(), s.end(), std::back_inserter(utf8_text));
+        return utf8_text;
+    }
+
+    // Return s converted from UTF-8 to UTF-32.
+    inline std::u32string utf8to32(std::string_view s)
+    {
+        std::u32string utf32_text;
+        utf8to32(s.begin(), s.end(), std::back_inserter(utf32_text));
+        return utf32_text;
+    }
+
+    // Offset of the first invalid sequence in s, or std::string_view::npos
+    // when the whole view validates.
+    inline std::size_t find_invalid(std::string_view s)
+    {
+        const std::string_view::const_iterator first_bad = find_invalid(s.begin(), s.end());
+        if (first_bad == s.end())
+            return std::string_view::npos;
+        return first_bad - s.begin();
+    }
+
+    // True when s contains no invalid sequence.
+    inline bool is_valid(std::string_view s)
+    {
+        const bool all_valid = is_valid(s.begin(), s.end());
+        return all_valid;
+    }
+
+    // Copy of s with every invalid sequence replaced by replacement.
+    inline std::string replace_invalid(std::string_view s, char32_t replacement)
+    {
+        std::string repaired;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired), replacement);
+        return repaired;
+    }
+
+    // Copy of s with every invalid sequence replaced by the default marker
+    // of the iterator-based overload.
+    inline std::string replace_invalid(std::string_view s)
+    {
+        std::string repaired;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired));
+        return repaired;
+    }
+
+    // True when s begins with the byte order mark.
+    inline bool starts_with_bom(std::string_view s)
+    {
+        const bool has_bom = starts_with_bom(s.begin(), s.end());
+        return has_bom;
+    }
+
+} // namespace utf8
+
+#endif // header guard
+
--- a/mlu_370-piper/piper/src/cpp/utf8/unchecked.h
+++ b/mlu_370-piper/piper/src/cpp/utf8/unchecked.h
@@ -0,0 +1,274 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+
+namespace utf8
+{
+    // "Unchecked" variants of the UTF-8 helpers: none of the decoding
+    // functions below test iterators against an end position or validate
+    // the octets they read, so callers must pass well-formed input.
+    namespace unchecked
+    {
+        // Encode code point `cp` as 1-4 octets written through `result` and
+        // return the advanced output iterator. `cp` itself is not validated;
+        // anything >= 0x10000 takes the four-octet branch.
+        template <typename octet_iterator>
+        octet_iterator append(uint32_t cp, octet_iterator result)
+        {
+            if (cp < 0x80)                        // one octet
+                *(result++) = static_cast<uint8_t>(cp);
+            else if (cp < 0x800) {                // two octets
+                *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
+                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
+            }
+            else if (cp < 0x10000) {              // three octets
+                *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
+                *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
+            }
+            else {                                // four octets
+                *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
+                *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
+                *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
+            }
+            return result;
+        }
+
+        // Copy [start, end) to `out`, passing through every sequence that
+        // internal::validate_next accepts and writing `replacement` once for
+        // each sequence it rejects. Returns the advanced output iterator.
+        template <typename octet_iterator, typename output_iterator>
+        output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+        {
+            while (start != end) {
+                octet_iterator sequence_start = start;
+                internal::utf_error err_code = utf8::internal::validate_next(start, end);
+                switch (err_code) {
+                    case internal::UTF8_OK :
+                        // validate_next moved `start` past the sequence; copy it verbatim
+                        for (octet_iterator it = sequence_start; it != start; ++it)
+                            *out++ = *it;
+                        break;
+                    case internal::NOT_ENOUGH_ROOM:
+                        // truncated tail: one replacement, then stop
+                        out = utf8::unchecked::append (replacement, out);
+                        start = end;
+                        break;
+                    case internal::INVALID_LEAD:
+                        // skip only the offending octet
+                        out = utf8::unchecked::append (replacement, out);
+                        ++start;
+                        break;
+                    case internal::INCOMPLETE_SEQUENCE:
+                    case internal::OVERLONG_SEQUENCE:
+                    case internal::INVALID_CODE_POINT:
+                        out = utf8::unchecked::append (replacement, out);
+                        ++start;
+                        // just one replacement mark for the sequence
+                        while (start != end && utf8::internal::is_trail(*start))
+                            ++start;
+                        break;
+                }
+            }
+            return out;
+        }
+
+        // Overload of replace_invalid that substitutes U+FFFD.
+        template <typename octet_iterator, typename output_iterator>
+        inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+        {
+            static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
+            return utf8::unchecked::replace_invalid(start, end, out, replacement_marker);
+        }
+
+        // Decode the code point starting at `it` and advance `it` past it.
+        // The number of octets consumed is whatever
+        // internal::sequence_length reports for the lead octet; no end
+        // iterator is consulted and trail octets are not validated.
+        template <typename octet_iterator>
+        uint32_t next(octet_iterator& it)
+        {
+            uint32_t cp = utf8::internal::mask8(*it);
+            typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
+            switch (length) {
+                case 1:
+                    break;
+                case 2:
+                    it++;
+                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
+                    break;
+                case 3:
+                    ++it; 
+                    cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
+                    ++it;
+                    cp += (*it) & 0x3f;
+                    break;
+                case 4:
+                    ++it;
+                    cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);                
+                    ++it;
+                    cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
+                    ++it;
+                    cp += (*it) & 0x3f; 
+                    break;
+            }
+            // step past the last octet of the sequence (for any other
+            // length value this still advances by exactly one octet)
+            ++it;
+            return cp;
+        }
+
+        // Decode the code point at `it` without moving the caller's
+        // iterator (it is taken by value).
+        template <typename octet_iterator>
+        uint32_t peek_next(octet_iterator it)
+        {
+            return utf8::unchecked::next(it);
+        }
+
+        // Move `it` back to the nearest preceding octet that is not a trail
+        // octet and return the code point decoded from there. There is no
+        // lower bound check on how far `it` is decremented.
+        template <typename octet_iterator>
+        uint32_t prior(octet_iterator& it)
+        {
+            while (utf8::internal::is_trail(*(--it))) ;
+            octet_iterator temp = it;
+            return utf8::unchecked::next(temp);
+        }
+
+        // Move `it` by `n` code points: forward via next() when n >= 0,
+        // backward via prior() when n < 0.
+        template <typename octet_iterator, typename distance_type>
+        void advance (octet_iterator& it, distance_type n)
+        {
+            const distance_type zero(0);
+            if (n < zero) {
+                // backward
+                for (distance_type i = n; i < zero; ++i)
+                    utf8::unchecked::prior(it);
+            } else {
+                // forward
+                for (distance_type i = zero; i < n; ++i)
+                    utf8::unchecked::next(it);
+            }
+        }
+
+        // Count the code points decoded while stepping from `first` up to
+        // `last`. The loop condition uses operator<, so the iterator type
+        // must support ordering comparison.
+        template <typename octet_iterator>
+        typename std::iterator_traits<octet_iterator>::difference_type
+        distance (octet_iterator first, octet_iterator last)
+        {
+            typename std::iterator_traits<octet_iterator>::difference_type dist;
+            for (dist = 0; first < last; ++dist) 
+                utf8::unchecked::next(first);
+            return dist;
+        }
+
+        // Convert UTF-16 code units in [start, end) to UTF-8 written through
+        // `result`. After a lead surrogate the following unit is consumed
+        // as its trail surrogate without checking `end` or its value.
+        template <typename u16bit_iterator, typename octet_iterator>
+        octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+        {
+            while (start != end) {
+                uint32_t cp = utf8::internal::mask16(*start++);
+            // Take care of surrogate pairs first
+                if (utf8::internal::is_lead_surrogate(cp)) {
+                    uint32_t trail_surrogate = utf8::internal::mask16(*start++);
+                    cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
+                }
+                result = utf8::unchecked::append(cp, result);
+            }
+            return result;
+        }
+
+        // Convert UTF-8 octets in [start, end) to UTF-16 code units written
+        // through `result`; code points above 0xffff become two units.
+        // The loop uses operator< on the input iterators.
+        template <typename u16bit_iterator, typename octet_iterator>
+        u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+        {
+            while (start < end) {
+                uint32_t cp = utf8::unchecked::next(start);
+                if (cp > 0xffff) { //make a surrogate pair
+                    *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
+                    *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+                }
+                else
+                    *result++ = static_cast<uint16_t>(cp);
+            }
+            return result;
+        }
+
+        // Encode every 32-bit code point in [start, end) as UTF-8 through
+        // `result` and return the advanced output iterator.
+        template <typename octet_iterator, typename u32bit_iterator>
+        octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+        {
+            while (start != end)
+                result = utf8::unchecked::append(*(start++), result);
+
+            return result;
+        }
+
+        // Decode UTF-8 octets in [start, end) into 32-bit code points
+        // written through `result`. The loop uses operator< on the input
+        // iterators.
+        template <typename octet_iterator, typename u32bit_iterator>
+        u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+        {
+            while (start < end)
+                (*result++) = utf8::unchecked::next(start);
+
+            return result;
+        }
+
+        // The iterator class
+        // Bidirectional adaptor over an octet iterator: dereferencing yields
+        // the decoded code point by value, ++ skips one encoded sequence and
+        // -- steps back to the previous one. No range bounds are stored.
+        template <typename octet_iterator>
+          class iterator {
+            octet_iterator it;
+            public:
+            typedef uint32_t value_type;
+            typedef uint32_t* pointer;
+            typedef uint32_t& reference;
+            typedef std::ptrdiff_t difference_type;
+            typedef std::bidirectional_iterator_tag iterator_category;
+            // NOTE: `it` is not given a value by this constructor.
+            iterator () {}
+            explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
+            // the default "big three" are OK
+            // Return the wrapped octet iterator.
+            octet_iterator base () const { return it; }
+            // Decode at the current position using a temporary copy, so the
+            // stored iterator does not move.
+            uint32_t operator * () const
+            {
+                octet_iterator temp = it;
+                return utf8::unchecked::next(temp);
+            }
+            bool operator == (const iterator& rhs) const 
+            { 
+                return (it == rhs.it);
+            }
+            bool operator != (const iterator& rhs) const
+            {
+                return !(operator == (rhs));
+            }
+            // Pre-increment: skip sequence_length(it) octets.
+            iterator& operator ++ () 
+            {
+                ::std::advance(it, utf8::internal::sequence_length(it));
+                return *this;
+            }
+            // Post-increment: same skip, returns the previous position.
+            iterator operator ++ (int)
+            {
+                iterator temp = *this;
+                ::std::advance(it, utf8::internal::sequence_length(it));
+                return temp;
+            }  
+            // Pre-decrement: step back one code point via prior().
+            iterator& operator -- ()
+            {
+                utf8::unchecked::prior(it);
+                return *this;
+            }
+            // Post-decrement: same step, returns the previous position.
+            iterator operator -- (int)
+            {
+                iterator temp = *this;
+                utf8::unchecked::prior(it);
+                return temp;
+            }
+          }; // class iterator
+
+    } // namespace utf8::unchecked
+} // namespace utf8 
+
+
+#endif // header guard
+
--- a/mlu_370-piper/piper/src/cpp/wavfile.hpp
+++ b/mlu_370-piper/piper/src/cpp/wavfile.hpp
@@ -0,0 +1,40 @@
+#ifndef WAVFILE_H_
+#define WAVFILE_H_
+
+#include <iostream>
+
+// RIFF/WAVE header for PCM audio, laid out field-by-field in file order.
+// writeWavHeader() writes the raw bytes of this struct to the output, so the
+// on-disk header depends on the host's struct layout and byte order.
+// Fields without a default initializer are filled in by writeWavHeader().
+struct WavHeader {
+  uint8_t RIFF[4] = {'R', 'I', 'F', 'F'};
+  uint32_t chunkSize;       // dataSize + sizeof(WavHeader) - 8
+  uint8_t WAVE[4] = {'W', 'A', 'V', 'E'};
+
+  // fmt
+  uint8_t fmt[4] = {'f', 'm', 't', ' '};
+  uint32_t fmtSize = 16;    // bytes
+  uint16_t audioFormat = 1; // PCM
+  uint16_t numChannels;     // channel count passed to writeWavHeader
+  uint32_t sampleRate;      // Hertz
+  uint32_t bytesPerSec;     // sampleRate * sampleWidth * channels
+  uint16_t blockAlign = 2;  // default; writeWavHeader sets sampleWidth * channels
+  uint16_t bitsPerSample = 16;
+
+  // data
+  uint8_t data[4] = {'d', 'a', 't', 'a'};
+  uint32_t dataSize;        // numSamples * sampleWidth * channels
+};
+
+// Write WAV file header only (no sample data) to audioFile.
+//
+//   sampleRate  - samples per second (Hertz)
+//   sampleWidth - bytes per sample for a single channel
+//   channels    - number of channels
+//   numSamples  - number of samples per channel that will follow the header
+//
+// The header is written as the raw in-memory bytes of WavHeader, so the
+// result is only a well-formed WAV header on hosts whose struct layout has
+// no padding and whose byte order is little-endian.
+// dataSize is computed in 32-bit arithmetic and wraps for very large inputs.
+//
+// Declared inline because the definition lives in a header: without it,
+// including this file from more than one translation unit fails at link
+// time with a multiple-definition error.
+inline void writeWavHeader(int sampleRate, int sampleWidth, int channels,
+                           uint32_t numSamples, std::ostream &audioFile) {
+  WavHeader header;
+  header.dataSize = numSamples * sampleWidth * channels;
+  // RIFF chunk size excludes the 4-byte "RIFF" tag and the size field itself
+  header.chunkSize = header.dataSize + sizeof(WavHeader) - 8;
+  header.sampleRate = sampleRate;
+  header.numChannels = channels;
+  header.bytesPerSec = sampleRate * sampleWidth * channels;
+  header.blockAlign = sampleWidth * channels;
+  // Keep bitsPerSample consistent with blockAlign/bytesPerSec instead of
+  // leaving the struct default of 16 when sampleWidth is not 2.
+  header.bitsPerSample = sampleWidth * 8;
+  audioFile.write(reinterpret_cast<const char *>(&header), sizeof(header));
+
+} /* writeWavHeader */
+
+#endif // WAVFILE_H_
--- a/mlu_370-piper/piper/src/python/.dockerignore
+++ b/mlu_370-piper/piper/src/python/.dockerignore
@@ -0,0 +1 @@
+*
--- a/mlu_370-piper/piper/src/python/Dockerfile
+++ b/mlu_370-piper/piper/src/python/Dockerfile
@@ -0,0 +1,6 @@
+FROM nvcr.io/nvidia/pytorch:22.03-py3
+
+RUN pip3 install \
+    'pytorch-lightning~=1.7.0'
+
+ENV NUMBA_CACHE_DIR=.numba_cache
--- a/mlu_370-piper/piper/src/python/README.md
+++ b/mlu_370-piper/piper/src/python/README.md
--- a/mlu_370-piper/piper/src/python/build_monotonic_align.sh
+++ b/mlu_370-piper/piper/src/python/build_monotonic_align.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Compile piper_train/vits/monotonic_align/core.pyx in place with cythonize
+# and move the resulting shared object into the monotonic_align/ subdirectory.
+# -e: abort on the first failing command; pipefail: a pipeline fails if any
+# stage fails.
+set -eo pipefail
+
+# Absolute path of the directory containing this script
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+# Use the local virtual environment when one exists next to the script
+if [ -d "${this_dir}/.venv" ]; then
+    source "${this_dir}/.venv/bin/activate"
+fi
+
+cd "${this_dir}/piper_train/vits/monotonic_align"
+mkdir -p monotonic_align
+cythonize -i core.pyx
+mv core*.so monotonic_align/
--- a/mlu_370-piper/piper/src/python/mypy.ini
+++ b/mlu_370-piper/piper/src/python/mypy.ini
@@ -0,0 +1,11 @@
+
+[mypy]
+
+[mypy-setuptools.*]
+ignore_missing_imports = True
+
+[mypy-librosa.*]
+ignore_missing_imports = True
+
+[mypy-onnxruntime.*]
+ignore_missing_imports = True
--- a/mlu_370-piper/piper/src/python/piper_train/.gitignore
+++ b/mlu_370-piper/piper/src/python/piper_train/.gitignore
@@ -0,0 +1,11 @@
+.DS_Store
+.idea
+*.log
+tmp/
+
+*.py[cod]
+*.egg
+build
+htmlcov
+
+.venv/
--- a/mlu_370-piper/piper/src/python/piper_train/.isort.cfg
+++ b/mlu_370-piper/piper/src/python/piper_train/.isort.cfg
@@ -0,0 +1,6 @@
+[settings]
+multi_line_output=3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
--- a/mlu_370-piper/piper/src/python/piper_train/VERSION
+++ b/mlu_370-piper/piper/src/python/piper_train/VERSION
@@ -0,0 +1 @@
+1.0.0
--- a/mlu_370-piper/piper/src/python/piper_train/init.py
+++ b/mlu_370-piper/piper/src/python/piper_train/init.py
--- a/mlu_370-piper/piper/src/python/piper_train/main.py
+++ b/mlu_370-piper/piper/src/python/piper_train/main.py
@@ -0,0 +1,147 @@
+import argparse
+import json
+import logging
+from pathlib import Path
+
+import torch
+from pytorch_lightning import Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint
+
+from .vits.lightning import VitsModel
+
+_LOGGER = logging.getLogger(__package__)
+
+
+def main():
+    logging.basicConfig(level=logging.DEBUG)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dataset-dir", required=True, help="Path to pre-processed dataset directory"
+    )
+    parser.add_argument(
+        "--checkpoint-epochs",
+        type=int,
+        help="Save checkpoint every N epochs (default: 1)",
+    )
+    parser.add_argument(
+        "--quality",
+        default="medium",
+        choices=("x-low", "medium", "high"),
+        help="Quality/size of model (default: medium)",
+    )
+    parser.add_argument(
+        "--resume_from_single_speaker_checkpoint",
+        help="For multi-speaker models only. Converts a single-speaker checkpoint to multi-speaker and resumes training",
+    )
+    Trainer.add_argparse_args(parser)
+    VitsModel.add_model_specific_args(parser)
+    parser.add_argument("--seed", type=int, default=1234)
+    args = parser.parse_args()
+    _LOGGER.debug(args)
+
+    args.dataset_dir = Path(args.dataset_dir)
+    if not args.default_root_dir:
+        args.default_root_dir = args.dataset_dir
+
+    torch.backends.cudnn.benchmark = True
+    torch.manual_seed(args.seed)
+
+    config_path = args.dataset_dir / "config.json"
+    dataset_path = args.dataset_dir / "dataset.jsonl"
+
+    with open(config_path, "r", encoding="utf-8") as config_file:
+        # See preprocess.py for format
+        config = json.load(config_file)
+        num_symbols = int(config["num_symbols"])
+        num_speakers = int(config["num_speakers"])
+        sample_rate = int(config["audio"]["sample_rate"])
+
+    trainer = Trainer.from_argparse_args(args)
+    if args.checkpoint_epochs is not None:
+        trainer.callbacks = [ModelCheckpoint(every_n_epochs=args.checkpoint_epochs)]
+        _LOGGER.debug(
+            "Checkpoints will be saved every %s epoch(s)", args.checkpoint_epochs
+        )
+
+    dict_args = vars(args)
+    if args.quality == "x-low":
+        dict_args["hidden_channels"] = 96
+        dict_args["inter_channels"] = 96
+        dict_args["filter_channels"] = 384
+    elif args.quality == "high":
+        dict_args["resblock"] = "1"
+        dict_args["resblock_kernel_sizes"] = (3, 7, 11)
+        dict_args["resblock_dilation_sizes"] = (
+            (1, 3, 5),
+            (1, 3, 5),
+            (1, 3, 5),
+        )
+        dict_args["upsample_rates"] = (8, 8, 2, 2)
+        dict_args["upsample_initial_channel"] = 512
+        dict_args["upsample_kernel_sizes"] = (16, 16, 4, 4)
+
+    model = VitsModel(
+        num_symbols=num_symbols,
+        num_speakers=num_speakers,
+        sample_rate=sample_rate,
+        dataset=[dataset_path],
+        **dict_args,
+    )
+
+    if args.resume_from_single_speaker_checkpoint:
+        assert (
+            num_speakers > 1
+        ), "--resume_from_single_speaker_checkpoint is only for multi-speaker models. Use --resume_from_checkpoint for single-speaker models."
+
+        # Load single-speaker checkpoint
+        _LOGGER.debug(
+            "Resuming from single-speaker checkpoint: %s",
+            args.resume_from_single_speaker_checkpoint,
+        )
+        model_single = VitsModel.load_from_checkpoint(
+            args.resume_from_single_speaker_checkpoint,
+            dataset=None,
+        )
+        g_dict = model_single.model_g.state_dict()
+        for key in list(g_dict.keys()):
+            # Remove keys that can't be copied over due to missing speaker embedding
+            if (
+                key.startswith("dec.cond")
+                or key.startswith("dp.cond")
+                or ("enc.cond_layer" in key)
+            ):
+                g_dict.pop(key, None)
+
+        # Copy over the multi-speaker model, excluding keys related to the
+        # speaker embedding (which is missing from the single-speaker model).
+        load_state_dict(model.model_g, g_dict)
+        load_state_dict(model.model_d, model_single.model_d.state_dict())
+        _LOGGER.info(
+            "Successfully converted single-speaker checkpoint to multi-speaker"
+        )
+
+    trainer.fit(model)
+
+
+def load_state_dict(model, saved_state_dict):
+    state_dict = model.state_dict()
+    new_state_dict = {}
+
+    for k, v in state_dict.items():
+        if k in saved_state_dict:
+            # Use saved value
+            new_state_dict[k] = saved_state_dict[k]
+        else:
+            # Use initialized value
+            _LOGGER.debug("%s is not in the checkpoint", k)
+            new_state_dict[k] = v
+
+    model.load_state_dict(new_state_dict)
+
+
+# -----------------------------------------------------------------------------
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/_resources.py
+++ b/mlu_370-piper/piper/src/python/piper_train/_resources.py
@@ -0,0 +1,19 @@
+"""Shared access to package resources"""
+import os
+import typing
+from pathlib import Path
+
+try:
+    import importlib.resources
+
+    # The attribute lookup raises AttributeError when importlib.resources
+    # has no ``files`` function, which triggers the fallback below.
+    files = importlib.resources.files
+except (ImportError, AttributeError):
+    # Backport for Python < 3.9
+    import importlib_resources  # type: ignore
+
+    files = importlib_resources.files
+
+# Name of the package whose resource directory is resolved
+_PACKAGE = "piper_train"
+# Directory of the package resources as a filesystem path
+_DIR = Path(typing.cast(os.PathLike, files(_PACKAGE)))
+
+# Package version, read from the VERSION file inside the package directory
+__version__ = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
--- a/mlu_370-piper/piper/src/python/piper_train/check_phonemes.py
+++ b/mlu_370-piper/piper/src/python/piper_train/check_phonemes.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+import json
+import sys
+import unicodedata
+from collections import Counter
+
+from .phonemize import DEFAULT_PHONEME_ID_MAP
+
+
+def main() -> None:
+    used_phonemes: "Counter[str]" = Counter()
+    missing_phonemes: "Counter[str]" = Counter()
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+
+        utt = json.loads(line)
+        for phoneme in utt["phonemes"]:
+            used_phonemes[phoneme] += 1
+
+            if phoneme not in DEFAULT_PHONEME_ID_MAP:
+                missing_phonemes[phoneme] += 1
+
+    if missing_phonemes:
+        print("Missing", len(missing_phonemes), "phoneme(s)", file=sys.stderr)
+
+    json.dump(
+        {
+            "used": {
+                phoneme: {
+                    "count": count,
+                    "hex": f"\\u{hex(ord(phoneme))}",
+                    "name": unicodedata.category(phoneme),
+                    "category": unicodedata.category(phoneme),
+                }
+                for phoneme, count in used_phonemes.most_common()
+            },
+            "missing": {
+                phoneme: {
+                    "count": count,
+                    "hex": f"\\u{hex(ord(phoneme))}",
+                    "name": unicodedata.category(phoneme),
+                    "category": unicodedata.category(phoneme),
+                }
+                for phoneme, count in missing_phonemes.most_common()
+            },
+        },
+        sys.stdout,
+    )
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/clean_cached_audio.py
+++ b/mlu_370-piper/piper/src/python/piper_train/clean_cached_audio.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+import argparse
+from concurrent.futures import ThreadPoolExecutor
+import logging
+from pathlib import Path
+
+import torch
+
+_LOGGER = logging.getLogger()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--cache-dir",
+        required=True,
+        help="Path to directory with audio/spectrogram files (*.pt)",
+    )
+    parser.add_argument(
+        "--delete", action="store_true", help="Delete files that fail to load"
+    )
+    parser.add_argument("--debug", action="store_true")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+    _LOGGER.debug(args)
+
+    cache_dir = Path(args.cache_dir)
+    num_deleted = 0
+
+    def check_file(pt_path: Path) -> None:
+        nonlocal num_deleted
+
+        try:
+            _LOGGER.debug("Checking %s", pt_path)
+            torch.load(str(pt_path))
+        except Exception:
+            _LOGGER.error(pt_path)
+            if args.delete:
+                pt_path.unlink()
+                num_deleted += 1
+
+    with ThreadPoolExecutor() as executor:
+        for pt_path in cache_dir.glob("*.pt"):
+            executor.submit(check_file, pt_path)
+
+    print("Deleted:", num_deleted, "file(s)")
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/export_generator.py
+++ b/mlu_370-piper/piper/src/python/piper_train/export_generator.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+from pathlib import Path
+
+import torch
+
+from .vits.lightning import VitsModel
+
+_LOGGER = logging.getLogger("piper_train.export_generator")
+
+
+def main():
+    """Main entry point"""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
+    parser.add_argument("output", help="Path to output model (.pt)")
+
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to the console"
+    )
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    _LOGGER.debug(args)
+
+    # -------------------------------------------------------------------------
+
+    args.checkpoint = Path(args.checkpoint)
+    args.output = Path(args.output)
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+
+    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
+    model_g = model.model_g
+
+    # Inference only
+    model_g.eval()
+
+    with torch.no_grad():
+        model_g.dec.remove_weight_norm()
+
+    model_g.forward = model_g.infer
+
+    torch.save(model_g, args.output)
+
+    _LOGGER.info("Exported model to %s", args.output)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/export_onnx.py
+++ b/mlu_370-piper/piper/src/python/piper_train/export_onnx.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+from pathlib import Path
+from typing import Optional
+
+import torch
+
+from .vits.lightning import VitsModel
+
+_LOGGER = logging.getLogger("piper_train.export_onnx")
+
+OPSET_VERSION = 15
+
+
+def main() -> None:
+    """Export the generator of a training checkpoint to an ONNX file.
+
+    The generator's ``forward`` is replaced by a wrapper around ``infer``
+    that takes the three scale values packed into one tensor, and the
+    model is exported with a random dummy phoneme sequence.
+    """
+    # Fixed seed so the random dummy input below is reproducible
+    torch.manual_seed(1234)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
+    parser.add_argument("output", help="Path to output model (.onnx)")
+
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to the console"
+    )
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    _LOGGER.debug(args)
+
+    # -------------------------------------------------------------------------
+
+    args.checkpoint = Path(args.checkpoint)
+    args.output = Path(args.output)
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+
+    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
+    model_g = model.model_g
+
+    num_symbols = model_g.n_vocab
+    num_speakers = model_g.n_speakers
+
+    # Inference only
+    model_g.eval()
+
+    with torch.no_grad():
+        model_g.dec.remove_weight_norm()
+
+    # old_forward = model_g.infer
+
+    def infer_forward(text, text_lengths, scales, sid=None):
+        """Unpack ``scales`` and call ``model_g.infer``.
+
+        ``scales`` holds, in order: noise_scale, length_scale, noise_scale_w.
+        Returns the first element of the ``infer`` result with an extra
+        dimension inserted at index 1.
+        """
+        noise_scale = scales[0]
+        length_scale = scales[1]
+        noise_scale_w = scales[2]
+        audio = model_g.infer(
+            text,
+            text_lengths,
+            noise_scale=noise_scale,
+            length_scale=length_scale,
+            noise_scale_w=noise_scale_w,
+            sid=sid,
+        )[0].unsqueeze(1)
+
+        return audio
+
+    # torch.onnx.export traces forward(), so point it at the wrapper
+    model_g.forward = infer_forward
+
+    # Random phoneme ids of fixed length, used only to trace the model
+    dummy_input_length = 50
+    sequences = torch.randint(
+        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
+    )
+    sequence_lengths = torch.LongTensor([sequences.size(1)])
+
+    # A speaker id is only supplied for multi-speaker models
+    sid: Optional[torch.LongTensor] = None
+    if num_speakers > 1:
+        sid = torch.LongTensor([0])
+
+    # noise, length, noise_w (the order infer_forward reads them in)
+    scales = torch.FloatTensor([0.667, 1.0, 0.8])
+    dummy_input = (sequences, sequence_lengths, scales, sid)
+
+    # Export
+    torch.onnx.export(
+        model=model_g,
+        args=dummy_input,
+        f=str(args.output),
+        verbose=False,
+        opset_version=OPSET_VERSION,
+        input_names=["input", "input_lengths", "scales", "sid"],
+        output_names=["output"],
+        dynamic_axes={
+            "input": {0: "batch_size", 1: "phonemes"},
+            "input_lengths": {0: "batch_size"},
+            "output": {0: "batch_size", 1: "time"},
+        },
+    )
+
+    _LOGGER.info("Exported model to %s", args.output)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/export_onnx_streaming.py
+++ b/mlu_370-piper/piper/src/python/piper_train/export_onnx_streaming.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+
+import torch
+from torch import nn
+
+from .vits import commons
+from .vits.lightning import VitsModel
+
+_LOGGER = logging.getLogger("piper_train.export_onnx")
+OPSET_VERSION = 15
+
+
+class VitsEncoder(nn.Module):
+    """First half of the streaming export: everything up to sampling ``z_p``.
+
+    Wraps a generator object ``gen`` and uses its ``enc_p``, ``dp``,
+    ``emb_g``, ``n_speakers`` and ``use_sdp`` members. The returned values
+    are the inputs expected by ``VitsDecoder.forward``.
+    """
+
+    def __init__(self, gen):
+        super().__init__()
+        self.gen = gen
+
+    def forward(self, x, x_lengths, scales, sid=None):
+        """Return ``(z_p, y_mask, g)`` for the given phoneme ids.
+
+        ``scales`` holds, in order: noise_scale, length_scale,
+        noise_scale_w. ``sid`` is required when the generator has more
+        than one speaker; otherwise ``g`` is returned as ``None``.
+        """
+        noise_scale = scales[0]
+        length_scale = scales[1]
+        noise_scale_w = scales[2]
+
+        gen = self.gen
+        x, m_p, logs_p, x_mask = gen.enc_p(x, x_lengths)
+        if gen.n_speakers > 1:
+            assert sid is not None, "Missing speaker id"
+            g = gen.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+        else:
+            g = None
+
+        # gen.dp output is exponentiated below, i.e. treated as log-widths
+        if gen.use_sdp:
+            logw = gen.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
+        else:
+            logw = gen.dp(x, x_mask, g=g)
+        w = torch.exp(logw) * x_mask * length_scale
+        w_ceil = torch.ceil(w)
+        # Output length per batch item: sum of rounded-up widths, at least 1
+        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
+        y_mask = torch.unsqueeze(
+            commons.sequence_mask(y_lengths, y_lengths.max()), 1
+        ).type_as(x_mask)
+        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+        attn = commons.generate_path(w_ceil, attn_mask)
+
+        # Expand the per-input statistics to the output length via attn
+        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
+            1, 2
+        )  # [b, t', t], [b, t, d] -> [b, d, t']
+        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
+            1, 2
+        )  # [b, t', t], [b, t, d] -> [b, d, t']
+
+        # Sample around m_p with standard deviation exp(logs_p) * noise_scale
+        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+        return z_p, y_mask, g
+
+
+class VitsDecoder(nn.Module):
+    def __init__(self, gen):
+        super().__init__()
+        self.gen = gen
+
+    def forward(self, z, y_mask, g=None):
+        z = self.gen.flow(z, y_mask, g=g, reverse=True)
+        output = self.gen.dec((z * y_mask), g=g)
+        return output
+
+
+def main() -> None:
+    """Main entry point"""
+    torch.manual_seed(1234)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
+    parser.add_argument("output_dir", help="Path to output directory")
+
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to the console"
+    )
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    _LOGGER.debug(args)
+
+    # -------------------------------------------------------------------------
+
+    args.checkpoint = Path(args.checkpoint)
+    args.output_dir = Path(args.output_dir)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
+    model_g = model.model_g
+
+    with torch.no_grad():
+        model_g.dec.remove_weight_norm()
+
+    _LOGGER.info("Exporting encoder...")
+    decoder_input = export_encoder(args, model_g)
+    _LOGGER.info("Exporting decoder...")
+    export_decoder(args, model_g, decoder_input)
+    _LOGGER.info("Exported model to  %s", str(args.output_dir))
+
+
+def export_encoder(args, model_g):
+    model = VitsEncoder(model_g)
+    model.eval()
+
+    num_symbols = model_g.n_vocab
+    num_speakers = model_g.n_speakers
+
+    dummy_input_length = 50
+    sequences = torch.randint(
+        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
+    )
+    sequence_lengths = torch.LongTensor([sequences.size(1)])
+
+    sid: Optional[torch.LongTensor] = None
+    if num_speakers > 1:
+        sid = torch.LongTensor([0])
+
+    # noise, noise_w, length
+    scales = torch.FloatTensor([0.667, 1.0, 0.8])
+    dummy_input = (sequences, sequence_lengths, scales, sid)
+
+    output_names = [
+        "z",
+        "y_mask",
+    ]
+    if model_g.n_speakers > 1:
+        output_names.append("g")
+
+    onnx_path = os.fspath(args.output_dir.joinpath("encoder.onnx"))
+
+    # Export
+    torch.onnx.export(
+        model=model,
+        args=dummy_input,
+        f=onnx_path,
+        verbose=False,
+        opset_version=OPSET_VERSION,
+        input_names=["input", "input_lengths", "scales", "sid"],
+        output_names=output_names,
+        dynamic_axes={
+            "input": {0: "batch_size", 1: "phonemes"},
+            "input_lengths": {0: "batch_size"},
+            "output": {0: "batch_size", 2: "time"},
+        },
+    )
+    _LOGGER.info("Exported encoder to %s", onnx_path)
+
+    return model(*dummy_input)
+
+
+def export_decoder(args, model_g, decoder_input):
+    model = VitsDecoder(model_g)
+    model.eval()
+
+    input_names = [
+        "z",
+        "y_mask",
+    ]
+    if model_g.n_speakers > 1:
+        input_names.append("g")
+
+    onnx_path = os.fspath(args.output_dir.joinpath("decoder.onnx"))
+
+    # Export
+    torch.onnx.export(
+        model=model,
+        args=decoder_input,
+        f=onnx_path,
+        verbose=False,
+        opset_version=OPSET_VERSION,
+        input_names=input_names,
+        output_names=["output"],
+        dynamic_axes={
+            "z": {0: "batch_size", 2: "time"},
+            "y_mask": {0: "batch_size", 2: "time"},
+            "output": {0: "batch_size", 1: "time"},
+        },
+    )
+
+    _LOGGER.info("Exported decoder to %s", onnx_path)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/export_torchscript.py
+++ b/mlu_370-piper/piper/src/python/piper_train/export_torchscript.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+from pathlib import Path
+
+import torch
+
+from .vits.lightning import VitsModel
+
+_LOGGER = logging.getLogger("piper_train.export_torchscript")
+
+
+def main():
+    """Trace the generator of a training checkpoint and save it as TorchScript.
+
+    The checkpoint is loaded with ``VitsModel.load_from_checkpoint``, the
+    generator's ``infer`` method is installed as its ``forward``, and the
+    result of ``torch.jit.trace`` on a random 50-symbol input is written to
+    the ``output`` path (parent directories are created as needed).
+    """
+    # Fixed seed so the random dummy input is the same on every run
+    torch.manual_seed(1234)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
+    # This script writes a TorchScript file, not ONNX
+    parser.add_argument("output", help="Path to output TorchScript model (.ts)")
+
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to the console"
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    _LOGGER.debug(args)
+
+    # -------------------------------------------------------------------------
+
+    args.checkpoint = Path(args.checkpoint)
+    args.output = Path(args.output)
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+
+    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
+    model_g = model.model_g
+
+    num_symbols = model_g.n_vocab
+
+    # Inference only
+    model_g.eval()
+
+    with torch.no_grad():
+        model_g.dec.remove_weight_norm()
+
+    # Trace the inference path instead of the training forward pass
+    model_g.forward = model_g.infer
+
+    dummy_input_length = 50
+    sequences = torch.randint(
+        low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
+    )
+    sequence_lengths = torch.LongTensor([sequences.size(1)])
+
+    sid = torch.LongTensor([0])
+
+    # Positional arguments of model_g.infer: symbols, lengths, speaker id and
+    # three scalar float tensors (same values infer_torchscript.py passes).
+    dummy_input = (
+        sequences,
+        sequence_lengths,
+        sid,
+        torch.FloatTensor([0.667]),
+        torch.FloatTensor([1.0]),
+        torch.FloatTensor([0.8]),
+    )
+
+    jitted_model = torch.jit.trace(model_g, dummy_input)
+    torch.jit.save(jitted_model, str(args.output))
+
+    _LOGGER.info("Saved TorchScript model to %s", args.output)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/filter_utterances.py
+++ b/mlu_370-piper/piper/src/python/piper_train/filter_utterances.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import json
+import re
+import shutil
+import statistics
+import subprocess
+import sys
+import threading
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict, dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+
+from .norm_audio import make_silence_detector, trim_silence
+
+_DIR = Path(__file__).parent
+
+# Removed from the speaking rate calculation.
+# This has to be a character class: without the brackets the pattern is one
+# long sequence (with "." and "?" acting as regex operators) that practically
+# never matches, so no punctuation would be stripped.
+_PUNCTUATION = re.compile(r"[.。,，?¿？؟!！;；:：\-—]")
+
+
+class ExcludeReason(str, Enum):
+    """Why an utterance was left out of the filtered metadata.
+
+    Members are also ``str`` instances, so they serialize to JSON as their
+    plain string value.
+    """
+
+    # The WAV file could not be found at any of the candidate paths
+    MISSING = "file_missing"
+    # The WAV file exists but has a size of zero bytes
+    EMPTY = "file_empty"
+    # Speaking rate is below the speaker's lower bound
+    LOW = "rate_low"
+    # Speaking rate is above the speaker's upper bound
+    HIGH = "rate_high"
+
+
+@dataclass
+class Utterance:
+    """One metadata row together with its measured speaking rate."""
+
+    id: str
+    text: str
+    duration_sec: float
+    speaker: str
+    exclude_reason: Optional[ExcludeReason] = None
+    rate: float = 0.0
+
+    def __post_init__(self):
+        """Derive ``rate`` (characters per second) when a duration is known."""
+        if not self.duration_sec > 0:
+            # No usable duration: keep the rate that was passed in
+            return
+
+        # Punctuation is left out of the character count because silence is
+        # removed from the measured duration.
+        spoken_text = _PUNCTUATION.sub("", self.text)
+        self.rate = len(spoken_text) / self.duration_sec
+
+
+def _resolve_wav_path(dataset_dir: Path, wav_dir: Path, filename: str) -> Path:
+    """Return the first existing candidate path for *filename*.
+
+    Candidates are tried relative to the dataset directory and then to the
+    wav directory, each with and without a ".wav" suffix.  When none exists
+    the last candidate is returned so the caller can report it as missing.
+    """
+    candidates = (
+        dataset_dir / filename,
+        dataset_dir / f"{filename}.wav",
+        wav_dir / filename,
+        wav_dir / f"{filename}.wav",
+    )
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+
+    return candidates[-1]
+
+
+def main():
+    """Filter utterances whose speaking rate is an outlier for their speaker.
+
+    Reads "|"-delimited metadata rows from stdin (file name first, text last,
+    speaker in the second column when a row has more than two columns),
+    measures the speaking duration of every WAV file, and writes the rows
+    that are kept to stdout in the same format.  A row is dropped when its
+    rate lies more than ``--scale-lower``/``--scale-upper`` inter-quartile
+    ranges below the first / above the third quartile of its speaker, or when
+    its audio file is missing or empty.  With ``--write-json`` a per-speaker
+    report of the bounds and the excluded utterances is written as well.
+
+    Raises:
+        RuntimeError: if ffmpeg is not available on PATH.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--write-json", help="Path to write information about excluded utterances"
+    )
+    parser.add_argument(
+        "--dataset-dir", default=Path.cwd(), help="Path to dataset directory"
+    )
+    parser.add_argument("--scale-lower", type=float, default=2.0)
+    parser.add_argument("--scale-upper", type=float, default=2.0)
+    args = parser.parse_args()
+
+    # ProcessUtterance.get_duration runs the "ffmpeg" binary, so that is the
+    # executable that has to be present.
+    if not shutil.which("ffmpeg"):
+        raise RuntimeError("ffmpeg not found (is it installed and on PATH?)")
+
+    dataset_dir = Path(args.dataset_dir)
+    wav_dir = dataset_dir / "wav"
+    if not wav_dir.is_dir():
+        wav_dir = dataset_dir / "wavs"
+
+    text_and_audio = []
+    for row in csv.reader(sys.stdin, delimiter="|"):
+        if not row:
+            # Blank line in the metadata
+            continue
+
+        filename, text = row[0], row[-1]
+        speaker = row[1] if len(row) > 2 else "default"
+        wav_path = _resolve_wav_path(dataset_dir, wav_dir, filename)
+        text_and_audio.append((filename, text, wav_path, speaker))
+
+    # speaker -> [utterance]
+    utts_by_speaker = defaultdict(list)
+    process_utterance = ProcessUtterance()
+    with ThreadPoolExecutor() as executor:
+        for utt in executor.map(
+            lambda item: process_utterance(*item), text_and_audio
+        ):
+            utts_by_speaker[utt.speaker].append(utt)
+
+    is_multispeaker = len(utts_by_speaker) > 1
+    writer = csv.writer(sys.stdout, delimiter="|")
+
+    speaker_details = {}
+    for speaker, utts in utts_by_speaker.items():
+        # Utterances already excluded (missing/empty file) carry a rate of 0
+        # that says nothing about the speaker; they must neither distort the
+        # quantiles nor be written to the output.
+        usable = [utt for utt in utts if utt.exclude_reason is None]
+        if not usable:
+            continue
+
+        rates = [utt.rate for utt in usable]
+        if len(rates) > 1:
+            # Exclude rates well outside the 25%/75% quantiles
+            rate_qs = statistics.quantiles(rates, n=4)
+        else:
+            # statistics.quantiles needs at least two data points; a single
+            # utterance is its own quartiles and is therefore always kept.
+            rate_qs = [rates[0]] * 3
+
+        q1 = rate_qs[0]  # 25%
+        q3 = rate_qs[-1]  # 75%
+        iqr = q3 - q1
+        lower = q1 - (args.scale_lower * iqr)
+        upper = q3 + (args.scale_upper * iqr)
+        speaker_details[speaker] = {
+            "min": min(rates),
+            "max": max(rates),
+            # Key spelling kept as-is: it is part of the JSON report format
+            "quanties": rate_qs,
+            "lower": lower,
+            "upper": upper,
+        }
+
+        for utt in usable:
+            if utt.rate < lower:
+                utt.exclude_reason = ExcludeReason.LOW
+            elif utt.rate > upper:
+                utt.exclude_reason = ExcludeReason.HIGH
+            elif is_multispeaker:
+                writer.writerow((utt.id, utt.speaker, utt.text))
+            else:
+                writer.writerow((utt.id, utt.text))
+
+    if args.write_json:
+        report = {}
+        for speaker, utts in utts_by_speaker.items():
+            excluded = [asdict(utt) for utt in utts if utt.exclude_reason is not None]
+            report[speaker] = {
+                # None when the speaker has no utterance with usable audio
+                "details": speaker_details.get(speaker),
+                "num_utterances": len(utts),
+                "num_excluded": len(excluded),
+                "excluded": excluded,
+            }
+
+        with open(args.write_json, "w", encoding="utf-8") as json_file:
+            json.dump(report, json_file, indent=4, ensure_ascii=False)
+
+
+class ProcessUtterance:
+    """Callable that turns one metadata row into an :class:`Utterance`.
+
+    One instance is shared by all worker threads; every thread lazily creates
+    its own silence detector in thread-local storage.
+    """
+
+    def __init__(self):
+        # Holds one "detector" attribute per thread (created on first use)
+        self.thread_data = threading.local()
+
+    def __call__(
+        self, utt_id: str, text: str, wav_path: Path, speaker: str
+    ) -> Utterance:
+        """Build the utterance for *wav_path*.
+
+        Missing and zero-byte files are returned with a duration of 0.0 and
+        the matching ``exclude_reason``; otherwise the speaking duration is
+        measured with :meth:`get_duration`.
+        """
+        if not wav_path.exists():
+            return Utterance(
+                utt_id,
+                text,
+                0.0,
+                speaker,
+                exclude_reason=ExcludeReason.MISSING,
+            )
+
+        if wav_path.stat().st_size == 0:
+            return Utterance(
+                utt_id,
+                text,
+                0.0,
+                speaker,
+                exclude_reason=ExcludeReason.EMPTY,
+            )
+
+        return Utterance(utt_id, text, self.get_duration(wav_path), speaker)
+
+    def get_duration(self, audio_path: Path) -> float:
+        """Return the speaking duration of *audio_path* in seconds.
+
+        The file is decoded by ffmpeg to 16 kHz mono 16-bit PCM, peak
+        normalized, and passed to ``trim_silence``; the duration of the span
+        it reports is returned.  Audio that decodes to zero samples has a
+        duration of 0.0.
+
+        Raises:
+            subprocess.CalledProcessError: if ffmpeg exits with an error.
+        """
+        if not hasattr(self.thread_data, "detector"):
+            self.thread_data.detector = make_silence_detector()
+
+        vad_sample_rate = 16000
+        audio_16khz_bytes = subprocess.check_output(
+            [
+                "ffmpeg",
+                "-i",
+                str(audio_path),
+                "-f",
+                "s16le",
+                "-acodec",
+                "pcm_s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(vad_sample_rate),
+                "pipe:",
+            ],
+            stderr=subprocess.DEVNULL,
+        )
+
+        audio_16khz = np.frombuffer(audio_16khz_bytes, dtype=np.int16).astype(
+            np.float32
+        )
+        if audio_16khz.size == 0:
+            # Nothing was decoded; np.max would raise on an empty array
+            return 0.0
+
+        # Normalize by the largest magnitude (not the magnitude of the largest
+        # value, which under-estimates the peak when the negative excursion
+        # dominates).  All-zero audio is left alone to avoid dividing by zero.
+        peak = np.max(np.abs(audio_16khz))
+        if peak > 0:
+            audio_16khz /= peak
+
+        # Get speaking duration
+        offset_sec, duration_sec = trim_silence(
+            audio_16khz,
+            self.thread_data.detector,
+            threshold=0.8,
+            samples_per_chunk=480,
+            sample_rate=vad_sample_rate,
+            keep_chunks_before=2,
+            keep_chunks_after=2,
+        )
+
+        if duration_sec is None:
+            # Speech goes to end of audio
+            duration_sec = (len(audio_16khz) / float(vad_sample_rate)) - offset_sec
+
+        return duration_sec
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/infer.py
+++ b/mlu_370-piper/piper/src/python/piper_train/infer.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import logging
+import sys
+import time
+from pathlib import Path
+
+import torch
+
+from .vits.lightning import VitsModel
+from .vits.utils import audio_float_to_int16
+from .vits.wavfile import write as write_wav
+
+_LOGGER = logging.getLogger("piper_train.infer")
+
+
+def main():
+    """Synthesize WAV files with a training checkpoint.
+
+    Every non-empty line on stdin is a JSON object with "phoneme_ids" and an
+    optional "speaker_id".  The audio for input line ``i`` (0-based, blank
+    lines included in the count) is written to ``<output-dir>/<i>.wav`` and
+    its real-time factor is logged at DEBUG level.
+    """
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser(prog="piper_train.infer")
+    parser.add_argument(
+        "--checkpoint", required=True, help="Path to model checkpoint (.ckpt)"
+    )
+    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
+    parser.add_argument("--sample-rate", type=int, default=22050)
+    #
+    parser.add_argument("--noise-scale", type=float, default=0.667)
+    parser.add_argument("--length-scale", type=float, default=1.0)
+    parser.add_argument("--noise-w", type=float, default=0.8)
+    #
+    args = parser.parse_args()
+
+    args.output_dir = Path(args.output_dir)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
+
+    # Inference only
+    model.eval()
+
+    with torch.no_grad():
+        model.model_g.dec.remove_weight_norm()
+
+    scales = [args.noise_scale, args.length_scale, args.noise_w]
+
+    for i, line in enumerate(sys.stdin):
+        line = line.strip()
+        if not line:
+            continue
+
+        utt = json.loads(line)
+        utt_id = str(i)
+        phoneme_ids = utt["phoneme_ids"]
+        speaker_id = utt.get("speaker_id")
+
+        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
+        text_lengths = torch.LongTensor([len(phoneme_ids)])
+        sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None
+
+        start_time = time.perf_counter()
+        # No gradients are needed for synthesis; disabling autograd avoids
+        # building a graph for every utterance.
+        with torch.no_grad():
+            audio = model(text, text_lengths, scales, sid=sid).detach().numpy()
+        audio = audio_float_to_int16(audio)
+        end_time = time.perf_counter()
+
+        audio_duration_sec = audio.shape[-1] / args.sample_rate
+        infer_sec = end_time - start_time
+        # Real-time factor: synthesis time relative to the audio produced
+        real_time_factor = (
+            infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
+        )
+
+        _LOGGER.debug(
+            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
+            i + 1,
+            real_time_factor,
+            infer_sec,
+            audio_duration_sec,
+        )
+
+        output_path = args.output_dir / f"{utt_id}.wav"
+        write_wav(str(output_path), args.sample_rate, audio)
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/infer_generator.py
+++ b/mlu_370-piper/piper/src/python/piper_train/infer_generator.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import logging
+import sys
+import time
+from pathlib import Path
+
+import torch
+
+from .vits.utils import audio_float_to_int16
+from .vits.wavfile import write as write_wav
+
+_LOGGER = logging.getLogger("piper_train.infer_generator")
+
+
+def main():
+    """Synthesize WAV files with a generator saved by ``torch.save``.
+
+    Every non-empty line on stdin is a JSON object with "phoneme_ids" and an
+    optional "speaker_id".  The audio for input line ``i`` (0-based, blank
+    lines included in the count) is written to ``<output-dir>/<i>.wav`` and
+    its real-time factor is logged at DEBUG level.
+    """
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser(prog="piper_train.infer_generator")
+    parser.add_argument("--model", required=True, help="Path to generator (.pt)")
+    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
+    parser.add_argument("--sample-rate", type=int, default=22050)
+    args = parser.parse_args()
+
+    args.output_dir = Path(args.output_dir)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    # SECURITY: torch.load unpickles the file and can execute arbitrary code;
+    # only load generator files from a trusted source.
+    model = torch.load(args.model)
+
+    # Inference only
+    model.eval()
+
+    for i, line in enumerate(sys.stdin):
+        line = line.strip()
+        if not line:
+            continue
+
+        utt = json.loads(line)
+        utt_id = str(i)
+        phoneme_ids = utt["phoneme_ids"]
+        speaker_id = utt.get("speaker_id")
+
+        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
+        text_lengths = torch.LongTensor([len(phoneme_ids)])
+        sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None
+
+        start_time = time.perf_counter()
+        # No gradients are needed for synthesis; disabling autograd avoids
+        # building a graph for every utterance.
+        with torch.no_grad():
+            audio = (
+                model(
+                    text,
+                    text_lengths,
+                    sid,
+                    # Scale tensors are deliberately not passed here:
+                    # torch.FloatTensor([0.667]),
+                    # torch.FloatTensor([1.0]),
+                    # torch.FloatTensor([0.8]),
+                )[0]
+                .detach()
+                .numpy()
+            )
+        audio = audio_float_to_int16(audio)
+        end_time = time.perf_counter()
+
+        audio_duration_sec = audio.shape[-1] / args.sample_rate
+        infer_sec = end_time - start_time
+        # Real-time factor: synthesis time relative to the audio produced
+        real_time_factor = (
+            infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
+        )
+
+        _LOGGER.debug(
+            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
+            i + 1,
+            real_time_factor,
+            infer_sec,
+            audio_duration_sec,
+        )
+
+        output_path = args.output_dir / f"{utt_id}.wav"
+        write_wav(str(output_path), args.sample_rate, audio)
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/infer_onnx.py
+++ b/mlu_370-piper/piper/src/python/piper_train/infer_onnx.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import logging
+import math
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+import onnxruntime
+
+from .vits.utils import audio_float_to_int16
+from .vits.wavfile import write as write_wav
+
+_LOGGER = logging.getLogger("piper_train.infer_onnx")
+
+
+def main():
+    """Synthesize WAV files with an exported ONNX model.
+
+    Every non-empty line on stdin is a JSON object with "phoneme_ids" and an
+    optional "speaker_id".  The audio for input line ``i`` (0-based, blank
+    lines included in the count) is written to ``<output-dir>/<i>.wav`` and
+    its real-time factor is logged at DEBUG level.
+    """
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser(prog="piper_train.infer_onnx")
+    parser.add_argument("--model", required=True, help="Path to model (.onnx)")
+    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
+    parser.add_argument("--sample-rate", type=int, default=22050)
+    parser.add_argument("--noise-scale", type=float, default=0.667)
+    parser.add_argument("--noise-scale-w", type=float, default=0.8)
+    parser.add_argument("--length-scale", type=float, default=1.0)
+    args = parser.parse_args()
+
+    args.output_dir = Path(args.output_dir)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    sess_options = onnxruntime.SessionOptions()
+    _LOGGER.debug("Loading model from %s", args.model)
+    model = onnxruntime.InferenceSession(str(args.model), sess_options=sess_options)
+    _LOGGER.info("Loaded model from %s", args.model)
+
+    # The scales do not depend on the utterance
+    scales = np.array(
+        [args.noise_scale, args.length_scale, args.noise_scale_w],
+        dtype=np.float32,
+    )
+
+    # Disabled denoiser setup, kept for reference:
+    # text_empty = np.zeros((1, 300), dtype=np.int64)
+    # text_lengths_empty = np.array([text_empty.shape[1]], dtype=np.int64)
+    # bias_audio = model.run(
+    #     None,
+    #     {"input": text_empty, "input_lengths": text_lengths_empty, "scales": scales},
+    # )[0].squeeze((0, 1))
+    # bias_spec, _ = transform(bias_audio)
+
+    for i, line in enumerate(sys.stdin):
+        line = line.strip()
+        if not line:
+            continue
+
+        utt = json.loads(line)
+        # utt_id = utt["id"]
+        utt_id = str(i)
+        phoneme_ids = utt["phoneme_ids"]
+        speaker_id = utt.get("speaker_id")
+
+        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        text_lengths = np.array([text.shape[1]], dtype=np.int64)
+
+        model_inputs = {
+            "input": text,
+            "input_lengths": text_lengths,
+            "scales": scales,
+        }
+        # onnxruntime rejects None values and input names the model does not
+        # declare, so "sid" is only fed when a speaker id was given.
+        if speaker_id is not None:
+            model_inputs["sid"] = np.array([speaker_id], dtype=np.int64)
+
+        start_time = time.perf_counter()
+        audio = model.run(None, model_inputs)[0].squeeze((0, 1))
+        # audio = denoise(audio, bias_spec, 10)
+        audio = audio_float_to_int16(audio.squeeze())
+        end_time = time.perf_counter()
+
+        audio_duration_sec = audio.shape[-1] / args.sample_rate
+        infer_sec = end_time - start_time
+        # Real-time factor: synthesis time relative to the audio produced
+        real_time_factor = (
+            infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
+        )
+
+        _LOGGER.debug(
+            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
+            i + 1,
+            real_time_factor,
+            infer_sec,
+            audio_duration_sec,
+        )
+
+        output_path = args.output_dir / f"{utt_id}.wav"
+        write_wav(str(output_path), args.sample_rate, audio)
+
+
+def denoise(
+    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
+) -> np.ndarray:
+    """Subtract a scaled bias spectrogram from *audio* and resynthesize it.
+
+    Args:
+        audio: Signals accepted by ``transform`` (a sequence of 1-dim arrays).
+        bias_spec: Magnitude spectrogram to subtract; its last axis is
+            repeated element-wise with ``np.repeat`` and cut to the number
+            of frames in the audio spectrogram.
+        denoiser_strength: Factor applied to the bias before subtraction.
+
+    Returns:
+        The result of ``inverse`` on the clipped magnitudes and the original
+        phase.
+    """
+    audio_spec, audio_angles = transform(audio)
+
+    bias_frames = bias_spec.shape[-1]
+    audio_frames = audio_spec.shape[-1]
+    n_repeats = max(1, math.ceil(audio_frames / bias_frames))
+    stretched_bias = np.repeat(bias_spec, n_repeats, axis=-1)[..., :audio_frames]
+
+    # Magnitudes cannot be negative, so the difference is clipped at zero
+    residual = audio_spec - (stretched_bias * denoiser_strength)
+    residual = np.clip(residual, a_min=0.0, a_max=None)
+
+    return inverse(residual, audio_angles)
+
+
+def stft(x, fft_size, hopsamp):
+    """Compute the short-time Fourier transform of a time domain signal.
+
+    Args:
+        x (1-dim Numpy array): A time domain signal.
+        fft_size (int): Length of each Hann-windowed frame.
+        hopsamp (int): Number of samples between the starts of two frames.
+    Returns:
+        The STFT. The rows are the time slices and columns are the frequency
+        bins. Only frame starts strictly below ``len(x) - fft_size`` are used.
+    """
+    window = np.hanning(fft_size)
+    frame_len = int(fft_size)
+    hop = int(hopsamp)
+
+    frame_starts = range(0, len(x) - frame_len, hop)
+    frames = [
+        np.fft.rfft(window * x[start : start + frame_len]) for start in frame_starts
+    ]
+    return np.array(frames)
+
+
+def istft(X, fft_size, hopsamp):
+    """Invert a STFT into a time domain signal by windowed overlap-add.
+
+    Args:
+        X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
+        fft_size (int): Length of each reconstructed frame.
+        hopsamp (int): The hop size, in samples.
+    Returns:
+        The inverse STFT, ``rows * hopsamp + fft_size`` samples long.
+    """
+    frame_len = int(fft_size)
+    hop = int(hopsamp)
+    window = np.hanning(frame_len)
+
+    n_frames = X.shape[0]
+    signal = np.zeros(int(n_frames * hop + frame_len))
+
+    # One frame start per row of X
+    frame_starts = range(0, len(signal) - frame_len, hop)
+    for spectrum, start in zip(X, frame_starts):
+        signal[start : start + frame_len] += window * np.real(np.fft.irfft(spectrum))
+
+    return signal
+
+
+def inverse(magnitude, phase):
+    """Rebuild time domain signals from magnitude and phase spectrograms.
+
+    Args:
+        magnitude: 3-dim array indexed (batch, frequency bin, frame).
+        phase: Array of phase angles matching *magnitude*.
+    Returns:
+        Array with one ``istft`` result (fft_size=1024, hopsamp=256) per
+        batch entry.
+    """
+    real_part = magnitude * np.cos(phase)
+    imag_part = magnitude * np.sin(phase)
+
+    n_batch, n_bins, n_frames = real_part.shape  # pylint: disable=unpacking-non-sequence
+    spectra = np.empty([n_batch, n_bins, n_frames], dtype=np.complex64)
+    spectra.real = real_part
+    spectra.imag = imag_part
+
+    # istft expects rows to be time slices, hence the transpose
+    signals = [
+        istft(spectrum.T, fft_size=1024, hopsamp=256)[None, :] for spectrum in spectra
+    ]
+
+    return np.concatenate(signals, 0)
+
+
+def transform(input_data):
+    """Compute magnitude and phase spectrograms for a batch of signals.
+
+    Args:
+        input_data: Sequence of 1-dim time domain signals; each one is passed
+            to ``stft`` with fft_size=1024 and hopsamp=256.
+    Returns:
+        Tuple ``(magnitude, phase)`` of arrays indexed
+        (signal, frequency bin, frame).
+    """
+    real_part = []
+    imag_part = []
+    for signal in input_data:
+        # Transposed so that rows are frequency bins and columns are frames
+        spectrum = stft(signal, fft_size=1024, hopsamp=256).T
+        real_part.append(spectrum.real[None, :, :])  # pylint: disable=unsubscriptable-object
+        imag_part.append(spectrum.imag[None, :, :])  # pylint: disable=unsubscriptable-object
+    real_part = np.concatenate(real_part, 0)
+    imag_part = np.concatenate(imag_part, 0)
+
+    magnitude = np.sqrt(real_part**2 + imag_part**2)
+    # Use the arrays directly: ndarray.data is a raw memoryview, which only
+    # worked because NumPy converted it back into an array.
+    phase = np.arctan2(imag_part, real_part)
+
+    return magnitude, phase
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/infer_onnx_streaming.py
+++ b/mlu_370-piper/piper/src/python/piper_train/infer_onnx_streaming.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import logging
+import math
+import os
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+import onnxruntime
+
+from .vits.utils import audio_float_to_int16
+
+# Named after this module (the name was previously that of infer_onnx)
+_LOGGER = logging.getLogger("piper_train.infer_onnx_streaming")
+
+
+class SpeechStreamer:
+    """
+    Stream speech in real time.
+
+    The encoder runs once per utterance; its output is then decoded in
+    chunks of frames so that audio can be emitted before the whole utterance
+    has been decoded.
+
+    Args:
+        encoder_path: path to encoder ONNX model
+        decoder_path: path to decoder ONNX model
+        sample_rate: output sample rate
+        chunk_size: number of mel frames to decode in each step
+            (time in secs = chunk_size * 256 / sample_rate)
+        chunk_padding: number of mel frames to be concatenated to the start
+            and end of the current chunk to reduce decoding artifacts
+    """
+
+    # Audio samples the decoder produces for every frame
+    SAMPLES_PER_FRAME = 256
+
+    def __init__(
+        self,
+        encoder_path,
+        decoder_path,
+        sample_rate,
+        chunk_size=45,
+        chunk_padding=10,
+    ):
+        sess_options = onnxruntime.SessionOptions()
+        _LOGGER.debug("Loading encoder model from %s", encoder_path)
+        self.encoder = onnxruntime.InferenceSession(
+            encoder_path, sess_options=sess_options
+        )
+        _LOGGER.debug("Loading decoder model from %s", decoder_path)
+        self.decoder = onnxruntime.InferenceSession(
+            decoder_path, sess_options=sess_options
+        )
+
+        self.sample_rate = sample_rate
+        self.chunk_size = chunk_size
+        self.chunk_padding = chunk_padding
+
+    def encoder_infer(self, enc_input):
+        """Run the encoder on *enc_input* and return its list of outputs.
+
+        Entries whose value is None (for example "sid" when no speaker id
+        was given) are dropped, because onnxruntime only accepts real arrays.
+        Timing and real-time factor are logged at DEBUG level.
+        """
+        feed = {name: value for name, value in enc_input.items() if value is not None}
+        started = time.perf_counter()
+        enc_output = self.encoder.run(None, feed)
+        elapsed = time.perf_counter() - started
+        _LOGGER.debug("Encoder inference %s", round(elapsed * 1000))
+        wav_length = enc_output[0].shape[2] * self.SAMPLES_PER_FRAME
+        if wav_length > 0:
+            enc_rtf = round(elapsed / (wav_length / self.sample_rate), 2)
+            _LOGGER.debug("Encoder RTF %s", enc_rtf)
+        return enc_output
+
+    def decoder_infer(self, z, y_mask, g=None):
+        """Decode frames *z* (with mask *y_mask*) into a squeezed audio array.
+
+        *g* is only fed to the decoder when it is not None.  Timing and
+        real-time factor are logged at DEBUG level.
+        """
+        dec_input = {"z": z, "y_mask": y_mask}
+        # Identity test: the truth value of a multi-element array is ambiguous
+        if g is not None:
+            dec_input["g"] = g
+        started = time.perf_counter()
+        audio = self.decoder.run(None, dec_input)[0].squeeze()
+        elapsed = time.perf_counter() - started
+        _LOGGER.debug("Decoder inference %s", round(elapsed * 1000))
+        if audio.size > 0:
+            dec_rtf = round(elapsed / (audio.size / self.sample_rate), 2)
+            _LOGGER.debug("Decoder RTF %s", dec_rtf)
+        return audio
+
+    def chunk(self, enc_output):
+        """Yield decoded audio for *enc_output*, one array per chunk.
+
+        *enc_output* is ``[z, y_mask, *extra]``; the extra entries are
+        passed unchanged to every decoder call.  Utterances of at most
+        ``chunk_size + 2 * chunk_padding`` frames are decoded in one piece.
+        Longer ones are split along the frame axis into pieces of
+        ``chunk_size`` frames; every piece is decoded together with
+        ``chunk_padding`` frames of its neighbours, and the audio belonging
+        to that padding is cut off again before yielding.
+        """
+        z, y_mask, *dec_args = enc_output
+        n_frames = z.shape[2]
+        if n_frames <= (self.chunk_size + (2 * self.chunk_padding)):
+            # Too short to stream.  This must be a yield: the value of a
+            # "return" inside a generator never reaches the consuming loop.
+            yield self.decoder_infer(z, y_mask, *dec_args)
+            return
+
+        split_at = [
+            i * self.chunk_size for i in range(1, math.ceil(n_frames / self.chunk_size))
+        ]
+        chunks = list(
+            zip(
+                np.split(z, split_at, axis=2),
+                np.split(y_mask, split_at, axis=2),
+            )
+        )
+        padding = self.chunk_padding
+        for idx, (z_chunk, y_mask_chunk) in enumerate(chunks):
+            # Samples to cut from either end; recomputed for every chunk so
+            # the last chunk is not trimmed by its predecessor's end padding.
+            wav_start_pad = 0
+            wav_end_pad = 0
+            if idx > 0 and padding > 0:
+                prev_z, prev_y_mask = chunks[idx - 1]
+                start_zpad = prev_z[:, :, -padding:]
+                start_ypad = prev_y_mask[:, :, -padding:]
+                z_chunk = np.concatenate([start_zpad, z_chunk], axis=2)
+                y_mask_chunk = np.concatenate([start_ypad, y_mask_chunk], axis=2)
+                wav_start_pad = start_zpad.shape[2] * self.SAMPLES_PER_FRAME
+            if (idx + 1) < len(chunks) and padding > 0:
+                next_z, next_y_mask = chunks[idx + 1]
+                end_zpad = next_z[:, :, :padding]
+                end_ypad = next_y_mask[:, :, :padding]
+                z_chunk = np.concatenate([z_chunk, end_zpad], axis=2)
+                y_mask_chunk = np.concatenate([y_mask_chunk, end_ypad], axis=2)
+                wav_end_pad = end_zpad.shape[2] * self.SAMPLES_PER_FRAME
+            audio = self.decoder_infer(z_chunk, y_mask_chunk, *dec_args)
+            # An explicit end index avoids the "-0" slice, which is empty
+            yield audio[wav_start_pad : len(audio) - wav_end_pad]
+
+    def stream(self, encoder_input):
+        """Yield the utterance as chunks of 16-bit PCM bytes.
+
+        Empty chunks are skipped; the delay until the first non-empty chunk
+        is logged once at DEBUG level.
+        """
+        start_time = time.perf_counter()
+        has_shown_latency = False
+        _LOGGER.debug("Starting synthesis")
+        enc_output = self.encoder_infer(encoder_input)
+        for wav in self.chunk(enc_output):
+            if len(wav) == 0:
+                continue
+            if not has_shown_latency:
+                latency_ms = round((time.perf_counter() - start_time) * 1000)
+                _LOGGER.debug("Latency %s", latency_ms)
+                has_shown_latency = True
+            audio = audio_float_to_int16(wav)
+            yield audio.tobytes()
+        _LOGGER.debug("Synthesis done!")
+
+
+def main():
+    """Stream synthesized speech to stdout as raw 16-bit PCM.
+
+    Every non-empty line on stdin is a JSON object with "phoneme_ids" and an
+    optional "speaker_id".  The audio of each utterance is written to the
+    binary stdout buffer chunk by chunk and flushed after every chunk.
+    """
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser(prog="piper_train.infer_onnx_streaming")
+    parser.add_argument(
+        "--encoder", required=True, help="Path to encoder model (.onnx)"
+    )
+    parser.add_argument(
+        "--decoder", required=True, help="Path to decoder  model (.onnx)"
+    )
+    parser.add_argument("--sample-rate", type=int, default=22050)
+    parser.add_argument("--noise-scale", type=float, default=0.667)
+    parser.add_argument("--noise-scale-w", type=float, default=0.8)
+    parser.add_argument("--length-scale", type=float, default=1.0)
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=45,
+        help="Number of mel frames to decode at each step"
+    )
+    parser.add_argument(
+        "--chunk-padding",
+        type=int,
+        default=5,
+        help="Number of mel frames to add to the start and end of the current chunk to reduce decoding artifacts"
+    )
+
+    args = parser.parse_args()
+
+    streamer = SpeechStreamer(
+        encoder_path=os.fspath(args.encoder),
+        decoder_path=os.fspath(args.decoder),
+        sample_rate=args.sample_rate,
+        chunk_size=args.chunk_size,
+        chunk_padding=args.chunk_padding,
+    )
+
+    output_buffer = sys.stdout.buffer
+
+    # The scales do not depend on the utterance
+    scales = np.array(
+        [args.noise_scale, args.length_scale, args.noise_scale_w],
+        dtype=np.float32,
+    )
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+
+        utt = json.loads(line)
+        phoneme_ids = utt["phoneme_ids"]
+        speaker_id = utt.get("speaker_id")
+
+        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        text_lengths = np.array([text.shape[1]], dtype=np.int64)
+
+        encoder_input = {
+            "input": text,
+            "input_lengths": text_lengths,
+            "scales": scales,
+        }
+        # onnxruntime rejects None values, so "sid" is only fed when a
+        # speaker id was given.
+        if speaker_id is not None:
+            encoder_input["sid"] = np.array([speaker_id], dtype=np.int64)
+
+        for wav_chunk in streamer.stream(encoder_input):
+            output_buffer.write(wav_chunk)
+            # Flush per chunk so a consumer can start playback immediately
+            output_buffer.flush()
+
+
+def denoise(
+    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
+) -> np.ndarray:
+    """Subtract a scaled bias spectrogram from *audio* and resynthesize it.
+
+    Args:
+        audio: Signals accepted by ``transform`` (a sequence of 1-dim arrays).
+        bias_spec: Magnitude spectrogram to subtract; its last axis is
+            repeated element-wise with ``np.repeat`` and cut to the number
+            of frames in the audio spectrogram.
+        denoiser_strength: Factor applied to the bias before subtraction.
+
+    Returns:
+        The result of ``inverse`` on the clipped magnitudes and the original
+        phase.
+    """
+    audio_spec, audio_angles = transform(audio)
+
+    bias_frames = bias_spec.shape[-1]
+    audio_frames = audio_spec.shape[-1]
+    n_repeats = max(1, math.ceil(audio_frames / bias_frames))
+    stretched_bias = np.repeat(bias_spec, n_repeats, axis=-1)[..., :audio_frames]
+
+    # Magnitudes cannot be negative, so the difference is clipped at zero
+    residual = audio_spec - (stretched_bias * denoiser_strength)
+    residual = np.clip(residual, a_min=0.0, a_max=None)
+
+    return inverse(residual, audio_angles)
+
+
+def stft(x, fft_size, hopsamp):
+    """Compute the short-time Fourier transform of a time domain signal.
+
+    Args:
+        x (1-dim Numpy array): A time domain signal.
+        fft_size (int): Length of each Hann-windowed frame.
+        hopsamp (int): Number of samples between the starts of two frames.
+    Returns:
+        The STFT. The rows are the time slices and columns are the frequency
+        bins. Only frame starts strictly below ``len(x) - fft_size`` are used.
+    """
+    window = np.hanning(fft_size)
+    frame_len = int(fft_size)
+    hop = int(hopsamp)
+
+    frame_starts = range(0, len(x) - frame_len, hop)
+    frames = [
+        np.fft.rfft(window * x[start : start + frame_len]) for start in frame_starts
+    ]
+    return np.array(frames)
+
+
+def istft(X, fft_size, hopsamp):
+    """Invert a STFT into a time domain signal by windowed overlap-add.
+
+    Args:
+        X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
+        fft_size (int): Length of each reconstructed frame.
+        hopsamp (int): The hop size, in samples.
+    Returns:
+        The inverse STFT, ``rows * hopsamp + fft_size`` samples long.
+    """
+    frame_len = int(fft_size)
+    hop = int(hopsamp)
+    window = np.hanning(frame_len)
+
+    n_frames = X.shape[0]
+    signal = np.zeros(int(n_frames * hop + frame_len))
+
+    # One frame start per row of X
+    frame_starts = range(0, len(signal) - frame_len, hop)
+    for spectrum, start in zip(X, frame_starts):
+        signal[start : start + frame_len] += window * np.real(np.fft.irfft(spectrum))
+
+    return signal
+
+
+def inverse(magnitude, phase):
+    """Rebuild time domain signals from magnitude and phase spectrograms.
+
+    Args:
+        magnitude: 3-dim array indexed (batch, frequency bin, frame).
+        phase: Array of phase angles matching *magnitude*.
+    Returns:
+        Array with one ``istft`` result (fft_size=1024, hopsamp=256) per
+        batch entry.
+    """
+    real_part = magnitude * np.cos(phase)
+    imag_part = magnitude * np.sin(phase)
+
+    n_batch, n_bins, n_frames = real_part.shape  # pylint: disable=unpacking-non-sequence
+    spectra = np.empty([n_batch, n_bins, n_frames], dtype=np.complex64)
+    spectra.real = real_part
+    spectra.imag = imag_part
+
+    # istft expects rows to be time slices, hence the transpose
+    signals = [
+        istft(spectrum.T, fft_size=1024, hopsamp=256)[None, :] for spectrum in spectra
+    ]
+
+    return np.concatenate(signals, 0)
+
+
+def transform(input_data):
+    """Compute magnitude and phase spectrograms for a batch of signals.
+
+    Args:
+        input_data: Sequence of 1-dim time domain signals; each one is passed
+            to ``stft`` with fft_size=1024 and hopsamp=256.
+    Returns:
+        Tuple ``(magnitude, phase)`` of arrays indexed
+        (signal, frequency bin, frame).
+    """
+    real_part = []
+    imag_part = []
+    for signal in input_data:
+        # Transposed so that rows are frequency bins and columns are frames
+        spectrum = stft(signal, fft_size=1024, hopsamp=256).T
+        real_part.append(spectrum.real[None, :, :])  # pylint: disable=unsubscriptable-object
+        imag_part.append(spectrum.imag[None, :, :])  # pylint: disable=unsubscriptable-object
+    real_part = np.concatenate(real_part, 0)
+    imag_part = np.concatenate(imag_part, 0)
+
+    magnitude = np.sqrt(real_part**2 + imag_part**2)
+    # Use the arrays directly: ndarray.data is a raw memoryview, which only
+    # worked because NumPy converted it back into an array.
+    phase = np.arctan2(imag_part, real_part)
+
+    return magnitude, phase
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/infer_torchscript.py
+++ b/mlu_370-piper/piper/src/python/piper_train/infer_torchscript.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import logging
+import sys
+import time
+from pathlib import Path
+
+import torch
+
+from .vits.utils import audio_float_to_int16
+from .vits.wavfile import write as write_wav
+
+_LOGGER = logging.getLogger("piper_train.infer_torchscript")
+
+
+def main():
+    """Main entry point"""
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser(prog="piper_train.infer_torchscript")
+    parser.add_argument(
+        "--model", required=True, help="Path to torchscript checkpoint (.ts)"
+    )
+    parser.add_argument("--output-dir", required=True, help="Path to write WAV files")
+    parser.add_argument("--sample-rate", type=int, default=22050)
+    args = parser.parse_args()
+
+    args.output_dir = Path(args.output_dir)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    model = torch.jit.load(args.model)
+
+    # Inference only
+    model.eval()
+
+    for i, line in enumerate(sys.stdin):
+        line = line.strip()
+        if not line:
+            continue
+
+        utt = json.loads(line)
+        utt_id = str(i)
+        phoneme_ids = utt["phoneme_ids"]
+        speaker_id = utt.get("speaker_id")
+
+        text = torch.LongTensor(phoneme_ids).unsqueeze(0)
+        text_lengths = torch.LongTensor([len(phoneme_ids)])
+        sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None
+
+        start_time = time.perf_counter()
+        audio = (
+            model(
+                text,
+                text_lengths,
+                sid,
+                torch.FloatTensor([0.667]),
+                torch.FloatTensor([1.0]),
+                torch.FloatTensor([0.8]),
+            )[0]
+            .detach()
+            .numpy()
+        )
+        audio = audio_float_to_int16(audio)
+        end_time = time.perf_counter()
+
+        audio_duration_sec = audio.shape[-1] / args.sample_rate
+        infer_sec = end_time - start_time
+        real_time_factor = (
+            infer_sec / audio_duration_sec if audio_duration_sec > 0 else 0.0
+        )
+
+        _LOGGER.debug(
+            "Real-time factor for %s: %0.2f (infer=%0.2f sec, audio=%0.2f sec)",
+            i + 1,
+            real_time_factor,
+            infer_sec,
+            audio_duration_sec,
+        )
+
+        output_path = args.output_dir / f"{utt_id}.wav"
+        write_wav(str(output_path), args.sample_rate, audio)
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/norm_audio/init.py
+++ b/mlu_370-piper/piper/src/python/piper_train/norm_audio/init.py
@@ -0,0 +1,92 @@
+from hashlib import sha256
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+import librosa
+import torch
+
+from piper_train.vits.mel_processing import spectrogram_torch
+
+from .trim import trim_silence
+from .vad import SileroVoiceActivityDetector
+
+_DIR = Path(__file__).parent
+
+
+def make_silence_detector() -> SileroVoiceActivityDetector:
+    silence_model = _DIR / "models" / "silero_vad.onnx"
+    return SileroVoiceActivityDetector(silence_model)
+
+
+def cache_norm_audio(
+    audio_path: Union[str, Path],
+    cache_dir: Union[str, Path],
+    detector: SileroVoiceActivityDetector,
+    sample_rate: int,
+    silence_threshold: float = 0.2,
+    silence_samples_per_chunk: int = 480,
+    silence_keep_chunks_before: int = 2,
+    silence_keep_chunks_after: int = 2,
+    filter_length: int = 1024,
+    window_length: int = 1024,
+    hop_length: int = 256,
+    ignore_cache: bool = False,
+) -> Tuple[Path, Path]:
+    audio_path = Path(audio_path).absolute()
+    cache_dir = Path(cache_dir)
+
+    # Cache id is the SHA256 of the full audio path
+    audio_cache_id = sha256(str(audio_path).encode()).hexdigest()
+
+    audio_norm_path = cache_dir / f"{audio_cache_id}.pt"
+    audio_spec_path = cache_dir / f"{audio_cache_id}.spec.pt"
+
+    # Normalize audio
+    audio_norm_tensor: Optional[torch.FloatTensor] = None
+    if ignore_cache or (not audio_norm_path.exists()):
+        # Trim silence first.
+        #
+        # The VAD model works on 16khz, so we determine the portion of audio
+        # to keep and then just load that with librosa.
+        vad_sample_rate = 16000
+        audio_16khz, _sr = librosa.load(path=audio_path, sr=vad_sample_rate)
+
+        offset_sec, duration_sec = trim_silence(
+            audio_16khz,
+            detector,
+            threshold=silence_threshold,
+            samples_per_chunk=silence_samples_per_chunk,
+            sample_rate=vad_sample_rate,
+            keep_chunks_before=silence_keep_chunks_before,
+            keep_chunks_after=silence_keep_chunks_after,
+        )
+
+        # NOTE: audio is already in [-1, 1] coming from librosa
+        audio_norm_array, _sr = librosa.load(
+            path=audio_path,
+            sr=sample_rate,
+            offset=offset_sec,
+            duration=duration_sec,
+        )
+
+        # Save to cache directory
+        audio_norm_tensor = torch.FloatTensor(audio_norm_array).unsqueeze(0)
+        torch.save(audio_norm_tensor, audio_norm_path)
+
+    # Compute spectrogram
+    if ignore_cache or (not audio_spec_path.exists()):
+        if audio_norm_tensor is None:
+            # Load pre-cached normalized audio
+            audio_norm_tensor = torch.load(audio_norm_path)
+
+        audio_spec_tensor = spectrogram_torch(
+            y=audio_norm_tensor,
+            n_fft=filter_length,
+            sampling_rate=sample_rate,
+            hop_size=hop_length,
+            win_size=window_length,
+            center=False,
+        ).squeeze(0)
+        torch.save(audio_spec_tensor, audio_spec_path)
+
+    return audio_norm_path, audio_spec_path
--- a/mlu_370-piper/piper/src/python/piper_train/norm_audio/models/silero_vad.onnx
+++ b/mlu_370-piper/piper/src/python/piper_train/norm_audio/models/silero_vad.onnx
--- a/mlu_370-piper/piper/src/python/piper_train/norm_audio/trim.py
+++ b/mlu_370-piper/piper/src/python/piper_train/norm_audio/trim.py
@@ -0,0 +1,54 @@
+from typing import Optional, Tuple
+
+import numpy as np
+
+from .vad import SileroVoiceActivityDetector
+
+
+def trim_silence(
+    audio_array: np.ndarray,
+    detector: SileroVoiceActivityDetector,
+    threshold: float = 0.2,
+    samples_per_chunk=480,
+    sample_rate=16000,
+    keep_chunks_before: int = 2,
+    keep_chunks_after: int = 2,
+) -> Tuple[float, Optional[float]]:
+    """Returns the offset/duration of trimmed audio in seconds"""
+    offset_sec: float = 0.0
+    duration_sec: Optional[float] = None
+    first_chunk: Optional[int] = None
+    last_chunk: Optional[int] = None
+    seconds_per_chunk: float = samples_per_chunk / sample_rate
+
+    chunk = audio_array[:samples_per_chunk]
+    audio_array = audio_array[samples_per_chunk:]
+    chunk_idx: int = 0
+
+    # Determine main block of speech
+    while len(audio_array) > 0:
+        prob = detector(chunk, sample_rate=sample_rate)
+        is_speech = prob >= threshold
+
+        if is_speech:
+            if first_chunk is None:
+                # First speech
+                first_chunk = chunk_idx
+            else:
+                # Last speech so far
+                last_chunk = chunk_idx
+
+        chunk = audio_array[:samples_per_chunk]
+        audio_array = audio_array[samples_per_chunk:]
+        chunk_idx += 1
+
+    if (first_chunk is not None) and (last_chunk is not None):
+        first_chunk = max(0, first_chunk - keep_chunks_before)
+        last_chunk = min(chunk_idx, last_chunk + keep_chunks_after)
+
+        # Compute offset/duration
+        offset_sec = first_chunk * seconds_per_chunk
+        last_sec = (last_chunk + 1) * seconds_per_chunk
+        duration_sec = last_sec - offset_sec
+
+    return offset_sec, duration_sec
--- a/mlu_370-piper/piper/src/python/piper_train/norm_audio/vad.py
+++ b/mlu_370-piper/piper/src/python/piper_train/norm_audio/vad.py
@@ -0,0 +1,54 @@
+import typing
+from pathlib import Path
+
+import numpy as np
+import onnxruntime
+
+
+class SileroVoiceActivityDetector:
+    """Detects speech/silence using Silero VAD.
+
+    https://github.com/snakers4/silero-vad
+    """
+
+    def __init__(self, onnx_path: typing.Union[str, Path]):
+        onnx_path = str(onnx_path)
+
+        self.session = onnxruntime.InferenceSession(onnx_path)
+        self.session.intra_op_num_threads = 1
+        self.session.inter_op_num_threads = 1
+
+        self._h = np.zeros((2, 1, 64)).astype("float32")
+        self._c = np.zeros((2, 1, 64)).astype("float32")
+
+    def __call__(self, audio_array: np.ndarray, sample_rate: int = 16000):
+        """Return probability of speech in audio [0-1].
+
+        Audio must be 16Khz 16-bit mono PCM.
+        """
+        if len(audio_array.shape) == 1:
+            # Add batch dimension
+            audio_array = np.expand_dims(audio_array, 0)
+
+        if len(audio_array.shape) > 2:
+            raise ValueError(
+                f"Too many dimensions for input audio chunk {audio_array.shape}"
+            )
+
+        if audio_array.shape[0] > 1:
+            raise ValueError("Onnx model does not support batching")
+
+        if sample_rate != 16000:
+            raise ValueError("Only 16Khz audio is supported")
+
+        ort_inputs = {
+            "input": audio_array.astype(np.float32),
+            "h0": self._h,
+            "c0": self._c,
+        }
+        ort_outs = self.session.run(None, ort_inputs)
+        out, self._h, self._c = ort_outs
+
+        out = out.squeeze(2)[:, 1]  # make output type match JIT analog
+
+        return out
--- a/mlu_370-piper/piper/src/python/piper_train/preprocess.py
+++ b/mlu_370-piper/piper/src/python/piper_train/preprocess.py
@@ -0,0 +1,502 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import dataclasses
+import itertools
+import json
+import logging
+import os
+import unicodedata
+from collections import Counter
+from dataclasses import dataclass, field
+from enum import Enum
+from multiprocessing import JoinableQueue, Process, Queue
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional
+
+from piper_phonemize import (
+    phonemize_espeak,
+    phonemize_codepoints,
+    phoneme_ids_espeak,
+    phoneme_ids_codepoints,
+    get_codepoints_map,
+    get_espeak_map,
+    get_max_phonemes,
+    tashkeel_run,
+)
+
+from .norm_audio import cache_norm_audio, make_silence_detector
+
+_DIR = Path(__file__).parent
+_VERSION = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
+_LOGGER = logging.getLogger("preprocess")
+
+
+class PhonemeType(str, Enum):
+    ESPEAK = "espeak"
+    """Phonemes come from espeak-ng"""
+
+    TEXT = "text"
+    """Phonemes come from text itself"""
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input-dir", required=True, help="Directory with audio dataset"
+    )
+    parser.add_argument(
+        "--output-dir",
+        required=True,
+        help="Directory to write output files for training",
+    )
+    parser.add_argument("--language", required=True, help="eSpeak-ng voice")
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        required=True,
+        help="Target sample rate for voice (hertz)",
+    )
+    parser.add_argument(
+        "--dataset-format", choices=("ljspeech", "mycroft"), required=True
+    )
+    parser.add_argument("--cache-dir", help="Directory to cache processed audio files")
+    parser.add_argument("--max-workers", type=int)
+    parser.add_argument(
+        "--single-speaker", action="store_true", help="Force single speaker dataset"
+    )
+    parser.add_argument(
+        "--speaker-id", type=int, help="Add speaker id to single speaker dataset"
+    )
+    #
+    parser.add_argument(
+        "--phoneme-type",
+        choices=list(PhonemeType),
+        default=PhonemeType.ESPEAK,
+        help="Type of phonemes to use (default: espeak)",
+    )
+    parser.add_argument(
+        "--text-casing",
+        choices=("ignore", "lower", "upper", "casefold"),
+        default="ignore",
+        help="Casing applied to utterance text",
+    )
+    #
+    parser.add_argument(
+        "--dataset-name",
+        help="Name of dataset to put in config (default: name of <ouput_dir>/../)",
+    )
+    parser.add_argument(
+        "--audio-quality",
+        help="Audio quality to put in config (default: name of <output_dir>)",
+    )
+    #
+    parser.add_argument(
+        "--tashkeel",
+        action="store_true",
+        help="Diacritize Arabic text with libtashkeel",
+    )
+    #
+    parser.add_argument(
+        "--skip-audio", action="store_true", help="Don't preprocess audio"
+    )
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to the console"
+    )
+    args = parser.parse_args()
+
+    if args.single_speaker and (args.speaker_id is not None):
+        _LOGGER.fatal("--single-speaker and --speaker-id cannot both be provided")
+        return
+
+    level = logging.DEBUG if args.debug else logging.INFO
+    logging.basicConfig(level=level)
+    logging.getLogger().setLevel(level)
+
+    # Prevent log spam
+    logging.getLogger("numba").setLevel(logging.WARNING)
+
+    # Ensure enum
+    args.phoneme_type = PhonemeType(args.phoneme_type)
+
+    # Convert to paths and create output directories
+    args.input_dir = Path(args.input_dir)
+    args.output_dir = Path(args.output_dir)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    args.cache_dir = (
+        Path(args.cache_dir)
+        if args.cache_dir
+        else args.output_dir / "cache" / str(args.sample_rate)
+    )
+    args.cache_dir.mkdir(parents=True, exist_ok=True)
+
+    if args.dataset_format == "mycroft":
+        make_dataset = mycroft_dataset
+    else:
+        make_dataset = ljspeech_dataset
+
+    # Count speakers
+    _LOGGER.debug("Counting number of speakers/utterances in the dataset")
+    speaker_counts: "Counter[str]" = Counter()
+    num_utterances = 0
+    for utt in make_dataset(args):
+        speaker = utt.speaker or ""
+        speaker_counts[speaker] += 1
+        num_utterances += 1
+
+    assert num_utterances > 0, "No utterances found"
+
+    is_multispeaker = len(speaker_counts) > 1
+    speaker_ids: Dict[str, int] = {}
+
+    if is_multispeaker:
+        _LOGGER.info("%s speakers detected", len(speaker_counts))
+
+        # Assign speaker ids by most number of utterances first
+        for speaker_id, (speaker, _speaker_count) in enumerate(
+            speaker_counts.most_common()
+        ):
+            speaker_ids[speaker] = speaker_id
+    else:
+        _LOGGER.info("Single speaker dataset")
+
+    # Write config
+    audio_quality = args.audio_quality or args.output_dir.name
+    dataset_name = args.dataset_name or args.output_dir.parent.name
+
+    with open(args.output_dir / "config.json", "w", encoding="utf-8") as config_file:
+        json.dump(
+            {
+                "dataset": dataset_name,
+                "audio": {
+                    "sample_rate": args.sample_rate,
+                    "quality": audio_quality,
+                },
+                "espeak": {
+                    "voice": args.language,
+                },
+                "language": {
+                    "code": args.language,
+                },
+                "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
+                "phoneme_type": args.phoneme_type.value,
+                "phoneme_map": {},
+                "phoneme_id_map": get_codepoints_map()[args.language]
+                if args.phoneme_type == PhonemeType.TEXT
+                else get_espeak_map(),
+                "num_symbols": get_max_phonemes(),
+                "num_speakers": len(speaker_counts),
+                "speaker_id_map": speaker_ids,
+                "piper_version": _VERSION,
+            },
+            config_file,
+            ensure_ascii=False,
+            indent=4,
+        )
+    _LOGGER.info("Wrote dataset config")
+
+    if (args.max_workers is None) or (args.max_workers < 1):
+        args.max_workers = os.cpu_count()
+
+    assert args.max_workers is not None
+
+    batch_size = int(num_utterances / (args.max_workers * 2))
+    queue_in: "Queue[Iterable[Utterance]]" = JoinableQueue()
+    queue_out: "Queue[Optional[Utterance]]" = Queue()
+
+    # Start workers
+    if args.phoneme_type == PhonemeType.TEXT:
+        target = phonemize_batch_text
+    else:
+        target = phonemize_batch_espeak
+
+    processes = [
+        Process(target=target, args=(args, queue_in, queue_out))
+        for _ in range(args.max_workers)
+    ]
+    for proc in processes:
+        proc.start()
+
+    _LOGGER.info(
+        "Processing %s utterance(s) with %s worker(s)", num_utterances, args.max_workers
+    )
+    with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
+        for utt_batch in batched(
+            make_dataset(args),
+            batch_size,
+        ):
+            queue_in.put(utt_batch)
+
+        _LOGGER.debug("Waiting for jobs to finish")
+        missing_phonemes: "Counter[str]" = Counter()
+        for _ in range(num_utterances):
+            utt = queue_out.get()
+            if utt is not None:
+                if utt.speaker is not None:
+                    utt.speaker_id = speaker_ids[utt.speaker]
+
+                utt_dict = dataclasses.asdict(utt)
+                utt_dict.pop("missing_phonemes")
+
+                # JSONL
+                json.dump(
+                    utt_dict,
+                    dataset_file,
+                    ensure_ascii=False,
+                    cls=PathEncoder,
+                )
+                print("", file=dataset_file)
+
+                missing_phonemes.update(utt.missing_phonemes)
+
+        if missing_phonemes:
+            for phoneme, count in missing_phonemes.most_common():
+                _LOGGER.warning("Missing %s (%s)", phoneme, count)
+
+            _LOGGER.warning("Missing %s phoneme(s)", len(missing_phonemes))
+
+    # Signal workers to stop
+    for proc in processes:
+        queue_in.put(None)
+
+    # Wait for workers to stop
+    for proc in processes:
+        proc.join(timeout=1)
+
+
+# -----------------------------------------------------------------------------
+
+
+def get_text_casing(casing: str):
+    if casing == "lower":
+        return str.lower
+
+    if casing == "upper":
+        return str.upper
+
+    if casing == "casefold":
+        return str.casefold
+
+    return lambda s: s
+
+
+def phonemize_batch_espeak(
+    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
+):
+    try:
+        casing = get_text_casing(args.text_casing)
+        silence_detector = make_silence_detector()
+
+        while True:
+            utt_batch = queue_in.get()
+            if utt_batch is None:
+                break
+
+            for utt in utt_batch:
+                try:
+                    if args.tashkeel:
+                        utt.text = tashkeel_run(utt.text)
+
+                    _LOGGER.debug(utt)
+                    all_phonemes = phonemize_espeak(casing(utt.text), args.language)
+
+                    # Flatten
+                    utt.phonemes = [
+                        phoneme
+                        for sentence_phonemes in all_phonemes
+                        for phoneme in sentence_phonemes
+                    ]
+                    utt.phoneme_ids = phoneme_ids_espeak(
+                        utt.phonemes,
+                        missing_phonemes=utt.missing_phonemes,
+                    )
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
+                    queue_out.put(utt)
+                except TimeoutError:
+                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
+                except Exception:
+                    _LOGGER.exception("Failed to process utterance: %s", utt)
+                    queue_out.put(None)
+
+            queue_in.task_done()
+    except Exception:
+        _LOGGER.exception("phonemize_batch_espeak")
+
+
+def phonemize_batch_text(
+    args: argparse.Namespace, queue_in: JoinableQueue, queue_out: Queue
+):
+    try:
+        casing = get_text_casing(args.text_casing)
+        silence_detector = make_silence_detector()
+
+        while True:
+            utt_batch = queue_in.get()
+            if utt_batch is None:
+                break
+
+            for utt in utt_batch:
+                try:
+                    if args.tashkeel:
+                        utt.text = tashkeel_run(utt.text)
+
+                    _LOGGER.debug(utt)
+                    all_phonemes = phonemize_codepoints(casing(utt.text))
+                    # Flatten
+                    utt.phonemes = [
+                        phoneme
+                        for sentence_phonemes in all_phonemes
+                        for phoneme in sentence_phonemes
+                    ]
+                    utt.phoneme_ids = phoneme_ids_codepoints(
+                        args.language,
+                        utt.phonemes,
+                        missing_phonemes=utt.missing_phonemes,
+                    )
+                    if not args.skip_audio:
+                        utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                            utt.audio_path,
+                            args.cache_dir,
+                            silence_detector,
+                            args.sample_rate,
+                        )
+                    queue_out.put(utt)
+                except TimeoutError:
+                    _LOGGER.error("Skipping utterance due to timeout: %s", utt)
+                except Exception:
+                    _LOGGER.exception("Failed to process utterance: %s", utt)
+                    queue_out.put(None)
+
+            queue_in.task_done()
+    except Exception:
+        _LOGGER.exception("phonemize_batch_text")
+
+
+# -----------------------------------------------------------------------------
+
+
+@dataclass
+class Utterance:
+    """One dataset item plus the values filled in during preprocessing."""
+
+    # Transcript text as read from the dataset metadata.
+    text: str
+    # Location of the source audio file.
+    audio_path: Path
+    # Speaker name from the dataset, if any.
+    speaker: Optional[str] = None
+    # Numeric speaker id, if any.
+    speaker_id: Optional[int] = None
+    # Flattened phonemes, set by the phonemize workers.
+    phonemes: Optional[List[str]] = None
+    # Ids matching ``phonemes``, set by the phonemize workers.
+    phoneme_ids: Optional[List[int]] = None
+    # Cache file paths returned by cache_norm_audio (unset with --skip-audio).
+    audio_norm_path: Optional[Path] = None
+    audio_spec_path: Optional[Path] = None
+    # Phonemes reported as missing while computing ids; dropped before the
+    # utterance is written to dataset.jsonl.
+    missing_phonemes: "Counter[str]" = field(default_factory=Counter)
+
+
+class PathEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, Path):
+            return str(o)
+        return super().default(o)
+
+
+def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
+    dataset_dir = args.input_dir
+    is_single_speaker = args.single_speaker
+    speaker_id = args.speaker_id
+    skip_audio = args.skip_audio
+
+    # filename|speaker|text
+    # speaker is optional
+    metadata_path = dataset_dir / "metadata.csv"
+    assert metadata_path.exists(), f"Missing {metadata_path}"
+
+    wav_dir = dataset_dir / "wav"
+    if not wav_dir.is_dir():
+        wav_dir = dataset_dir / "wavs"
+
+    with open(metadata_path, "r", encoding="utf-8") as csv_file:
+        reader = csv.reader(csv_file, delimiter="|")
+        for row in reader:
+            assert len(row) >= 2, "Not enough columns"
+
+            speaker: Optional[str] = None
+            if is_single_speaker or (len(row) == 2):
+                filename, text = row[0], row[-1]
+            else:
+                filename, speaker, text = row[0], row[1], row[-1]
+
+            # Try file name relative to metadata
+            wav_path = metadata_path.parent / filename
+
+            if not wav_path.exists():
+                # Try with .wav
+                wav_path = metadata_path.parent / f"{filename}.wav"
+
+            if not wav_path.exists():
+                # Try wav/ or wavs/
+                wav_path = wav_dir / filename
+
+            if not wav_path.exists():
+                # Try with .wav
+                wav_path = wav_dir / f"{filename}.wav"
+
+            if not skip_audio:
+                if not wav_path.exists():
+                    _LOGGER.warning("Missing %s", filename)
+                    continue
+
+                if wav_path.stat().st_size == 0:
+                    _LOGGER.warning("Empty file: %s", wav_path)
+                    continue
+
+            yield Utterance(
+                text=text, audio_path=wav_path, speaker=speaker, speaker_id=speaker_id
+            )
+
+
+def mycroft_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
+    dataset_dir = args.input_dir
+    is_single_speaker = args.single_speaker
+    skip_audio = args.skip_audio
+
+    speaker_id = 0
+    for metadata_path in dataset_dir.glob("**/*-metadata.txt"):
+        speaker = metadata_path.parent.name if not is_single_speaker else None
+        with open(metadata_path, "r", encoding="utf-8") as csv_file:
+            # filename|text|length
+            reader = csv.reader(csv_file, delimiter="|")
+            for row in reader:
+                filename, text = row[0], row[1]
+                wav_path = metadata_path.parent / filename
+                if skip_audio or (wav_path.exists() and (wav_path.stat().st_size > 0)):
+                    yield Utterance(
+                        text=text,
+                        audio_path=wav_path,
+                        speaker=speaker,
+                        speaker_id=speaker_id if not is_single_speaker else None,
+                    )
+        speaker_id += 1
+
+
+# -----------------------------------------------------------------------------
+
+
+def batched(iterable, n):
+    "Batch data into lists of length n. The last batch may be shorter."
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
+    it = iter(iterable)
+    batch = list(itertools.islice(it, n))
+    while batch:
+        yield batch
+        batch = list(itertools.islice(it, n))
+
+
+# -----------------------------------------------------------------------------
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/py.typed
+++ b/mlu_370-piper/piper/src/python/piper_train/py.typed
--- a/mlu_370-piper/piper/src/python/piper_train/pylintrc
+++ b/mlu_370-piper/piper/src/python/piper_train/pylintrc
@@ -0,0 +1,40 @@
+[MESSAGES CONTROL]
+disable=
+  format,
+  abstract-class-little-used,
+  abstract-method,
+  cyclic-import,
+  duplicate-code,
+  global-statement,
+  import-outside-toplevel,
+  inconsistent-return-statements,
+  locally-disabled,
+  not-context-manager,
+  redefined-variable-type,
+  too-few-public-methods,
+  too-many-arguments,
+  too-many-branches,
+  too-many-instance-attributes,
+  too-many-lines,
+  too-many-locals,
+  too-many-public-methods,
+  too-many-return-statements,
+  too-many-statements,
+  too-many-boolean-expressions,
+  unnecessary-pass,
+  unused-argument,
+  broad-except,
+  too-many-nested-blocks,
+  invalid-name,
+  unused-import,
+  no-self-use,
+  fixme,
+  useless-super-delegation,
+  missing-module-docstring,
+  missing-class-docstring,
+  missing-function-docstring,
+  import-error,
+  relative-beyond-top-level
+
+[FORMAT]
+expected-line-ending-format=LF
--- a/mlu_370-piper/piper/src/python/piper_train/select_speaker.py
+++ b/mlu_370-piper/piper/src/python/piper_train/select_speaker.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import sys
+from collections import Counter, defaultdict
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--speaker-number", type=int)
+    parser.add_argument("--speaker-name")
+    args = parser.parse_args()
+
+    assert (args.speaker_number is not None) or (args.speaker_name is not None)
+
+    reader = csv.reader(sys.stdin, delimiter="|")
+    writer = csv.writer(sys.stdout, delimiter="|")
+
+    if args.speaker_name is not None:
+        for row in reader:
+            audio, speaker_id, text = row[0], row[1], row[-1]
+            if args.speaker_name == speaker_id:
+                writer.writerow((audio, text))
+    else:
+        utterances = defaultdict(list)
+        counts = Counter()
+        for row in reader:
+            audio, speaker_id, text = row[0], row[1], row[-1]
+            utterances[speaker_id].append((audio, text))
+            counts[speaker_id] += 1
+
+        writer = csv.writer(sys.stdout, delimiter="|")
+        for i, (speaker_id, _count) in enumerate(counts.most_common()):
+            if i == args.speaker_number:
+                for row in utterances[speaker_id]:
+                    writer.writerow(row)
+
+                print(speaker_id, file=sys.stderr)
+                break
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/piper_train/setup.cfg
+++ b/mlu_370-piper/piper/src/python/piper_train/setup.cfg
@@ -0,0 +1,22 @@
+[flake8]
+# To work with Black
+max-line-length = 88
+# E501: line too long
+# W503: Line break occurred before a binary operator
+# E203: Whitespace before ':'
+# D202 No blank lines allowed after function docstring
+# W504 line break after binary operator
+ignore =
+    E501,
+    W503,
+    E203,
+    D202,
+    W504
+
+[isort]
+multi_line_output = 3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
+indent = "    "
--- a/mlu_370-piper/piper/src/python/piper_train/vits/init.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/init.py
--- a/mlu_370-piper/piper/src/python/piper_train/vits/attentions.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/attentions.py
@@ -0,0 +1,427 @@
+import math
+import typing
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .commons import subsequent_mask
+from .modules import LayerNorm
+
+
+class Encoder(nn.Module):
+    """Stack of self-attention and feed-forward layers with residual norms.
+
+    Each of the ``n_layers`` layers runs a ``MultiHeadAttention`` block
+    (built with ``window_size``) followed by an ``FFN`` block. Both
+    sub-layers are applied as ``norm(x + dropout(sublayer(x)))``.
+    Additional keyword arguments are accepted and ignored.
+    """
+
+    def __init__(
+        self,
+        hidden_channels: int,
+        filter_channels: int,
+        n_heads: int,
+        n_layers: int,
+        kernel_size: int = 1,
+        p_dropout: float = 0.0,
+        window_size: int = 4,
+        **kwargs
+    ):
+        super().__init__()
+        # Keep the hyper-parameters around for introspection.
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+
+        self.drop = nn.Dropout(p_dropout)
+        self.attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+
+        # Sub-modules are created in the order attention, norm, FFN, norm for
+        # every layer so parameter initialisation order stays stable.
+        for _ in range(self.n_layers):
+            self_attention = MultiHeadAttention(
+                hidden_channels,
+                hidden_channels,
+                n_heads,
+                p_dropout=p_dropout,
+                window_size=window_size,
+            )
+            self.attn_layers.append(self_attention)
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+
+            feed_forward = FFN(
+                hidden_channels,
+                hidden_channels,
+                filter_channels,
+                kernel_size,
+                p_dropout=p_dropout,
+            )
+            self.ffn_layers.append(feed_forward)
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask):
+        """Run every layer over ``x``, zeroing masked positions on entry and exit.
+
+        The pairwise attention mask is the outer product of ``x_mask`` with
+        itself over its last dimension.
+        """
+        pair_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        layers = zip(
+            self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
+        )
+        for attention, attention_norm, feed_forward, feed_forward_norm in layers:
+            attended = self.drop(attention(x, x, pair_mask))
+            x = attention_norm(x + attended)
+
+            transformed = self.drop(feed_forward(x, x_mask))
+            x = feed_forward_norm(x + transformed)
+        return x * x_mask
+
+
+class Decoder(nn.Module):
+    """Stack of self-attention, encoder-decoder attention and causal FFN layers.
+
+    Every layer applies three sub-layers in turn, each wrapped as
+    ``norm(x + dropout(sublayer(x)))``. Additional keyword arguments are
+    accepted and ignored.
+    """
+
+    def __init__(
+        self,
+        hidden_channels: int,
+        filter_channels: int,
+        n_heads: int,
+        n_layers: int,
+        kernel_size: int = 1,
+        p_dropout: float = 0.0,
+        proximal_bias: bool = False,
+        proximal_init: bool = True,
+        **kwargs
+    ):
+        super().__init__()
+        # Keep the hyper-parameters around for introspection.
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+
+        self.drop = nn.Dropout(p_dropout)
+        self.self_attn_layers = nn.ModuleList()
+        self.norm_layers_0 = nn.ModuleList()
+        self.encdec_attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+
+        # Creation order per layer: self-attention, norm, cross-attention,
+        # norm, FFN, norm (kept stable so initialisation is reproducible).
+        for _ in range(self.n_layers):
+            self_attention = MultiHeadAttention(
+                hidden_channels,
+                hidden_channels,
+                n_heads,
+                p_dropout=p_dropout,
+                proximal_bias=proximal_bias,
+                proximal_init=proximal_init,
+            )
+            self.self_attn_layers.append(self_attention)
+            self.norm_layers_0.append(LayerNorm(hidden_channels))
+
+            cross_attention = MultiHeadAttention(
+                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
+            )
+            self.encdec_attn_layers.append(cross_attention)
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+
+            feed_forward = FFN(
+                hidden_channels,
+                hidden_channels,
+                filter_channels,
+                kernel_size,
+                p_dropout=p_dropout,
+                causal=True,
+            )
+            self.ffn_layers.append(feed_forward)
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask, h, h_mask):
+        """Decode ``x`` while attending to ``h``.
+
+        x: decoder input
+        h: encoder output
+
+        Self-attention uses a lower-triangular mask sized by ``x_mask.size(2)``;
+        encoder-decoder attention uses the outer product of ``h_mask`` and
+        ``x_mask``.
+        """
+        causal_mask = subsequent_mask(x_mask.size(2)).type_as(x)
+        cross_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for layer_idx in range(self.n_layers):
+            self_out = self.self_attn_layers[layer_idx](x, x, causal_mask)
+            x = self.norm_layers_0[layer_idx](x + self.drop(self_out))
+
+            cross_out = self.encdec_attn_layers[layer_idx](x, h, cross_mask)
+            x = self.norm_layers_1[layer_idx](x + self.drop(cross_out))
+
+            ffn_out = self.ffn_layers[layer_idx](x, x_mask)
+            x = self.norm_layers_2[layer_idx](x + self.drop(ffn_out))
+        return x * x_mask
+
+
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention built from 1x1 ``Conv1d`` projections.
+
+    Optional extras, all handled in ``attention``:
+
+    * ``window_size``: learned relative-position embeddings for keys and
+      values (self-attention only).
+    * ``proximal_bias``: additive ``-log1p(|i - j|)`` bias (self-attention only).
+    * ``block_length``: band mask restricting attention to nearby positions
+      (only applied when a mask is passed).
+    * ``proximal_init``: start with the key projection equal to the query
+      projection.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        out_channels: int,
+        n_heads: int,
+        p_dropout: float = 0.0,
+        window_size: typing.Optional[int] = None,
+        heads_share: bool = True,
+        block_length: typing.Optional[int] = None,
+        proximal_bias: bool = False,
+        proximal_init: bool = False,
+    ):
+        super().__init__()
+        # Channels are split evenly across heads.
+        assert channels % n_heads == 0
+
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.block_length = block_length
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        # Placeholder; overwritten with the attention weights on every forward().
+        self.attn = torch.zeros(1)
+
+        # Per-head channel count.
+        self.k_channels = channels // n_heads
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, out_channels, 1)
+        self.drop = nn.Dropout(p_dropout)
+
+        if window_size is not None:
+            # One embedding table shared by all heads, or one table per head.
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels**-0.5
+            # Tables cover relative offsets -window_size..+window_size.
+            self.emb_rel_k = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+            self.emb_rel_v = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+
+        # Only q/k/v weights are re-initialised; conv_o keeps Conv1d's default.
+        nn.init.xavier_uniform_(self.conv_q.weight)
+        nn.init.xavier_uniform_(self.conv_k.weight)
+        nn.init.xavier_uniform_(self.conv_v.weight)
+        if proximal_init:
+            # Start with the key projection identical to the query projection.
+            with torch.no_grad():
+                self.conv_k.weight.copy_(self.conv_q.weight)
+                self.conv_k.bias.copy_(self.conv_q.bias)
+
+    def forward(self, x, c, attn_mask=None):
+        """Attend from ``x`` (queries) to ``c`` (keys and values).
+
+        The attention weights of the call are stored in ``self.attn``; the
+        attended result is passed through the output projection and returned.
+        """
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+        x = self.conv_o(x)
+        return x
+
+    def attention(self, query, key, value, mask=None):
+        """Scaled dot-product attention with the optional extras of this class.
+
+        Returns a tuple ``(output, p_attn)`` where ``output`` is ``[b, d, t_t]``
+        and ``p_attn`` is the post-dropout attention weights
+        ``[b, n_h, t_t, t_s]``.
+        """
+        # reshape [b, d, t] -> [b, n_h, t, d_k]
+        # t_s is the key/value length, t_t the query length.
+        b, d, t_s, t_t = (key.size(0), key.size(1), key.size(2), query.size(2))
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+        if self.window_size is not None:
+            assert (
+                t_s == t_t
+            ), "Relative attention is only available for self-attention."
+            # Add the logits contributed by the relative-position key embeddings.
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(
+                query / math.sqrt(self.k_channels), key_relative_embeddings
+            )
+            scores_local = self._relative_position_to_absolute_position(rel_logits)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).type_as(scores)
+        if mask is not None:
+            # A large negative score drives the softmax weight towards zero.
+            scores = scores.masked_fill(mask == 0, -1e4)
+            if self.block_length is not None:
+                assert (
+                    t_s == t_t
+                ), "Local attention is only available for self-attention."
+                # Band mask: keep only positions within block_length of the diagonal.
+                block_mask = (
+                    torch.ones_like(scores)
+                    .triu(-self.block_length)
+                    .tril(self.block_length)
+                )
+                scores = scores.masked_fill(block_mask == 0, -1e4)
+        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            # Add the contribution of the relative-position value embeddings.
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(
+                self.emb_rel_v, t_s
+            )
+            output = output + self._matmul_with_relative_values(
+                relative_weights, value_relative_embeddings
+            )
+        output = (
+            output.transpose(2, 3).contiguous().view(b, d, t_t)
+        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
+        return output, p_attn
+
+    def _matmul_with_relative_values(self, x, y):
+        """Multiply relative attention weights by relative value embeddings.
+
+        x: [b, h, l, m]
+        y: [h or 1, m, d]
+        ret: [b, h, l, d]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+
+    def _matmul_with_relative_keys(self, x, y):
+        """Multiply queries by the transposed relative key embeddings.
+
+        x: [b, h, l, d]
+        y: [h or 1, m, d]
+        ret: [b, h, l, m]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+
+    def _get_relative_embeddings(self, relative_embeddings, length: int):
+        """Return the ``2 * length - 1`` embedding rows used for ``length``.
+
+        The table is zero-padded on both sides when ``length`` exceeds
+        ``window_size + 1``, otherwise a centred slice of it is taken.
+        """
+        # max_relative_position = 2 * self.window_size + 1
+        # Pad first before slice to avoid using cond ops.
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = F.pad(
+                relative_embeddings,
+                # convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+                (0, 0, pad_length, pad_length, 0, 0),
+            )
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[
+            :, slice_start_position:slice_end_position
+        ]
+        return used_relative_embeddings
+
+    def _relative_position_to_absolute_position(self, x):
+        """Re-index scores from relative offsets to absolute key positions.
+
+        x: [b, h, l, 2*l-1]
+        ret: [b, h, l, l]
+        """
+        batch, heads, length, _ = x.size()
+
+        # Concat columns of pad to shift from relative to absolute indexing.
+        # x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+        x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0))
+
+        # Concat extra elements so to add up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch, heads, length * 2 * length])
+        # x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
+        x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0))
+
+        # Reshape and slice out the padded elements.
+        x_final = x_flat.view([batch, heads, length + 1, (2 * length) - 1])[
+            :, :, :length, length - 1 :
+        ]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        """Re-index weights from absolute key positions to relative offsets.
+
+        x: [b, h, l, l]
+        ret: [b, h, l, 2*l-1]
+        """
+        batch, heads, length, _ = x.size()
+
+        # Pad along the last (column) dimension.
+        # x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+        x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0))
+        x_flat = x.view([batch, heads, (length * length) + (length * (length - 1))])
+        # add 0's in the beginning that will skew the elements after reshape
+        # x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0))
+        # Drop the first column, which holds only padding after the skew.
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length: int):
+        """Bias for self-attention to encourage attention to close positions.
+        Args:
+          length: an integer scalar.
+        Returns:
+          a Tensor with shape [1, 1, length, length]
+        """
+        # Created as float32 on the default device; the caller casts it with type_as.
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(nn.Module):
+    """Two-layer 1-D convolutional feed-forward block with input masking.
+
+    The input is masked, padded (causally or symmetrically), convolved,
+    activated and passed through dropout, then masked, padded and convolved
+    again. ``activation == "gelu"`` selects ``x * sigmoid(1.702 * x)``; any
+    other value selects ReLU.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        filter_channels: int,
+        kernel_size: int,
+        p_dropout: float = 0.0,
+        activation: str = "",
+        causal: bool = False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+        self.causal = causal
+
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask):
+        """Apply both convolutions to ``x``, masking before each and after the last."""
+        hidden = self.conv_1(self._pad(x * x_mask))
+
+        if self.activation == "gelu":
+            hidden = hidden * torch.sigmoid(1.702 * hidden)
+        else:
+            hidden = torch.relu(hidden)
+        hidden = self.drop(hidden)
+
+        projected = self.conv_2(self._pad(hidden * x_mask))
+        return projected * x_mask
+
+    def _pad(self, x):
+        """Pad ``x`` along its last dimension according to ``self.causal``."""
+        if self.causal:
+            return self._causal_padding(x)
+        return self._same_padding(x)
+
+    def _causal_padding(self, x):
+        """Left-pad the last dimension by ``kernel_size - 1`` zeros."""
+        if self.kernel_size == 1:
+            return x
+        left = self.kernel_size - 1
+        return F.pad(x, (left, 0, 0, 0, 0, 0))
+
+    def _same_padding(self, x):
+        """Pad the last dimension on both sides so a convolution keeps its length."""
+        if self.kernel_size == 1:
+            return x
+        left = (self.kernel_size - 1) // 2
+        right = self.kernel_size // 2
+        return F.pad(x, (left, right, 0, 0, 0, 0))
--- a/mlu_370-piper/piper/src/python/piper_train/vits/commons.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/commons.py
@@ -0,0 +1,147 @@
+import logging
+import math
+from typing import Optional
+
+import torch
+from torch.nn import functional as F
+
+_LOGGER = logging.getLogger("vits.commons")
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    """Draw the weights of ``m`` from N(mean, std) if its class name contains "Conv".
+
+    Modules whose class name does not contain "Conv" are left untouched.
+    """
+    if "Conv" in m.__class__.__name__:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
+    span = kernel_size * dilation - dilation
+    return int(span / 2)
+
+
+def intersperse(lst, item):
+    """Return a list with ``item`` before, between and after the elements of ``lst``."""
+    total = len(lst) * 2 + 1
+    spaced = [item for _ in range(total)]
+    # Odd positions carry the original elements; even positions keep ``item``.
+    spaced[1::2] = lst
+    return spaced
+
+
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+    """KL(P||Q)
+
+    Element-wise value computed from the means ``m_*`` and log standard
+    deviations ``logs_*`` of the two distributions.
+    """
+    log_ratio_term = (logs_q - logs_p) - 0.5
+    squared_mean_gap = (m_p - m_q) ** 2
+    scaled_term = (
+        0.5 * (torch.exp(2.0 * logs_p) + squared_mean_gap) * torch.exp(-2.0 * logs_q)
+    )
+    return log_ratio_term + scaled_term
+
+
+def rand_gumbel(shape):
+    """Sample from the Gumbel distribution, protect from overflows."""
+    # Keep the uniform draw strictly inside (0, 1) so both logs stay finite.
+    bounded_uniform = torch.rand(shape) * 0.99998 + 0.00001
+    inner = -torch.log(bounded_uniform)
+    return -torch.log(inner)
+
+
+def rand_gumbel_like(x):
+    """Return Gumbel noise with the size, dtype and device of ``x``."""
+    noise = rand_gumbel(x.size())
+    return noise.to(dtype=x.dtype, device=x.device)
+
+
+def slice_segments(x, ids_str, segment_size=4):
+    """Cut one window of ``segment_size`` along the last dimension per batch item.
+
+    ``ids_str[i]`` is the start index for item ``i``; negative starts are
+    clamped to 0.
+    """
+    segments = torch.zeros_like(x[:, :, :segment_size])
+    for batch_idx in range(x.size(0)):
+        start = max(0, ids_str[batch_idx])
+        segments[batch_idx] = x[batch_idx, :, start : start + segment_size]
+    return segments
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+    """Cut a randomly positioned window of ``segment_size`` from each batch item.
+
+    When ``x_lengths`` is omitted the full last-dimension size of ``x`` is
+    used for every item. Returns ``(segments, start_indices)``.
+    """
+    batch_size, _, total_length = x.size()
+    if x_lengths is None:
+        x_lengths = total_length
+    max_start = x_lengths - segment_size + 1
+    uniform = torch.rand([batch_size]).to(device=x.device)
+    ids_str = (uniform * max_start).to(dtype=torch.long)
+    return slice_segments(x, ids_str, segment_size), ids_str
+
+
+def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
+    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.
+
+    ``channels // 2`` geometrically spaced timescales between
+    ``min_timescale`` and ``max_timescale`` are used. The first half of the
+    channels holds the sines, the second half the cosines, and one extra
+    zero channel is appended when ``channels`` is odd.
+
+    Args:
+        length: number of positions.
+        channels: number of output channels.
+        min_timescale: smallest timescale.
+        max_timescale: largest timescale.
+
+    Returns:
+        A float tensor of shape ``[1, channels, length]``.
+    """
+    position = torch.arange(length, dtype=torch.float)
+    num_timescales = channels // 2
+    # With a single timescale (channels of 2 or 3) the divisor would be zero
+    # and raise ZeroDivisionError. Clamp it to 1: the increment is only ever
+    # multiplied by index 0 in that case, so the result is unaffected.
+    log_timescale_increment = math.log(
+        float(max_timescale) / float(min_timescale)
+    ) / max(num_timescales - 1, 1)
+    inv_timescales = min_timescale * torch.exp(
+        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
+    )
+    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+    # Append one all-zero channel when ``channels`` is odd.
+    signal = F.pad(signal, [0, 0, 0, channels % 2])
+    signal = signal.view(1, channels, length)
+    return signal
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+    """Add a timing signal sized from ``x`` to ``x`` element-wise."""
+    _, channels, length = x.size()
+    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    timing = timing.to(dtype=x.dtype, device=x.device)
+    return x + timing
+
+
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+    """Concatenate a timing signal sized from ``x`` onto ``x`` along ``axis``."""
+    _, channels, length = x.size()
+    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    timing = timing.to(dtype=x.dtype, device=x.device)
+    return torch.cat([x, timing], axis)
+
+
+def subsequent_mask(length: int):
+    """Return a ``[1, 1, length, length]`` lower-triangular mask of ones."""
+    lower_triangle = torch.ones(length, length).tril()
+    return lower_triangle.unsqueeze(0).unsqueeze(0)
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    """Sum the inputs, then gate: tanh of the first channels times sigmoid of the rest.
+
+    ``n_channels[0]`` is the channel index at which the sum is split.
+    """
+    split_at = n_channels[0]
+    summed = input_a + input_b
+    tanh_part = torch.tanh(summed[:, :split_at, :])
+    sigmoid_part = torch.sigmoid(summed[:, split_at:, :])
+    return tanh_part * sigmoid_part
+
+
+def sequence_mask(length, max_length: Optional[int] = None):
+    """Return a boolean mask whose row ``i`` is True for the first ``length[i]`` positions.
+
+    The mask width is ``max_length``, or ``length.max()`` when it is omitted.
+    """
+    if max_length is None:
+        max_length = length.max()
+    positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return positions.unsqueeze(0) < length.unsqueeze(1)
+
+
+def generate_path(duration, mask):
+    """Expand per-token durations into an alignment path.
+
+    duration: [b, 1, t_x]
+    mask: [b, 1, t_y, t_x]
+
+    Returns a tensor shaped like ``mask``.
+    """
+    batch, _, t_y, t_x = mask.shape
+    cumulative = torch.cumsum(duration, -1).view(batch * t_x)
+
+    # One row per token: ones up to the token's cumulative end position.
+    filled = sequence_mask(cumulative, t_y).type_as(mask).view(batch, t_x, t_y)
+    # Subtract the previous token's row so each token keeps only its own span.
+    shifted = F.pad(filled, (0, 0, 1, 0, 0, 0))[:, :-1]
+    spans = filled - shifted
+    return spans.unsqueeze(1).transpose(2, 3) * mask
+
+
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+    """Clamp gradients in place and return their total norm measured before clamping.
+
+    ``parameters`` may be a single tensor or an iterable of tensors; entries
+    without a gradient are skipped. When ``clip_value`` is None only the
+    norm is computed.
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    with_grad = [p for p in parameters if p.grad is not None]
+    norm_type = float(norm_type)
+    if clip_value is not None:
+        clip_value = float(clip_value)
+
+    accumulated = 0
+    for param in with_grad:
+        grad = param.grad.data
+        # Each norm is taken before that parameter's gradient is clamped.
+        accumulated += grad.norm(norm_type).item() ** norm_type
+        if clip_value is not None:
+            grad.clamp_(min=-clip_value, max=clip_value)
+    return accumulated ** (1.0 / norm_type)
--- a/mlu_370-piper/piper/src/python/piper_train/vits/config.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/config.py
@@ -0,0 +1,330 @@
+"""Configuration classes"""
+from dataclasses import dataclass, field
+from typing import Optional, Tuple
+
+
+@dataclass
+class MelAudioConfig:
+    """Audio and mel-spectrogram settings, each with a default value.
+
+    NOTE(review): the units noted below are inferred from the field names;
+    how each value is consumed is not visible in this module.
+    """
+
+    filter_length: int = 1024
+    hop_length: int = 256
+    win_length: int = 1024
+    mel_channels: int = 80
+    sample_rate: int = 22050  # presumably samples per second
+    sample_bytes: int = 2  # presumably bytes per audio sample
+    channels: int = 1
+    mel_fmin: float = 0.0
+    mel_fmax: Optional[float] = None  # None means no explicit upper bound is set here
+
+
+@dataclass
+class ModelAudioConfig:
+    """Residual-block and upsampling settings, with two ready-made presets."""
+
+    resblock: str
+    resblock_kernel_sizes: Tuple[int, ...]
+    resblock_dilation_sizes: Tuple[Tuple[int, ...], ...]
+    upsample_rates: Tuple[int, ...]
+    upsample_initial_channel: int
+    upsample_kernel_sizes: Tuple[int, ...]
+
+    @staticmethod
+    def low_quality() -> "ModelAudioConfig":
+        """Return the preset using resblock "2" and three upsampling stages."""
+        dilations = ((1, 2), (2, 6), (3, 12))
+        preset = ModelAudioConfig(
+            resblock="2",
+            resblock_kernel_sizes=(3, 5, 7),
+            resblock_dilation_sizes=dilations,
+            upsample_rates=(8, 8, 4),
+            upsample_initial_channel=256,
+            upsample_kernel_sizes=(16, 16, 8),
+        )
+        return preset
+
+    @staticmethod
+    def high_quality() -> "ModelAudioConfig":
+        """Return the preset using resblock "1" and four upsampling stages."""
+        dilations = ((1, 3, 5),) * 3
+        preset = ModelAudioConfig(
+            resblock="1",
+            resblock_kernel_sizes=(3, 7, 11),
+            resblock_dilation_sizes=dilations,
+            upsample_rates=(8, 8, 2, 2),
+            upsample_initial_channel=512,
+            upsample_kernel_sizes=(16, 16, 4, 4),
+        )
+        return preset
+
+
+@dataclass
+class ModelConfig:
+    """Model hyper-parameters plus the nested audio and mel configurations.
+
+    The read-only properties below forward to the corresponding fields of
+    ``self.audio`` so they can be read directly from the model config.
+    """
+
+    num_symbols: int
+    n_speakers: int
+    audio: ModelAudioConfig
+    mel: MelAudioConfig = field(default_factory=MelAudioConfig)
+
+    inter_channels: int = 192
+    hidden_channels: int = 192
+    filter_channels: int = 768
+    n_heads: int = 2
+    n_layers: int = 6
+    kernel_size: int = 3
+    p_dropout: float = 0.1
+    n_layers_q: int = 3
+    use_spectral_norm: bool = False
+    gin_channels: int = 0  # single speaker
+    use_sdp: bool = True  # StochasticDurationPredictor
+    segment_size: int = 8192
+
+    @property
+    def is_multispeaker(self) -> bool:
+        """True when more than one speaker is configured."""
+        return self.n_speakers > 1
+
+    @property
+    def resblock(self) -> str:
+        """Forwarded from ``self.audio.resblock``."""
+        return self.audio.resblock
+
+    @property
+    def resblock_kernel_sizes(self) -> Tuple[int, ...]:
+        """Forwarded from ``self.audio.resblock_kernel_sizes``."""
+        return self.audio.resblock_kernel_sizes
+
+    @property
+    def resblock_dilation_sizes(self) -> Tuple[Tuple[int, ...], ...]:
+        """Forwarded from ``self.audio.resblock_dilation_sizes``."""
+        return self.audio.resblock_dilation_sizes
+
+    @property
+    def upsample_rates(self) -> Tuple[int, ...]:
+        """Forwarded from ``self.audio.upsample_rates``."""
+        return self.audio.upsample_rates
+
+    @property
+    def upsample_initial_channel(self) -> int:
+        """Forwarded from ``self.audio.upsample_initial_channel``."""
+        return self.audio.upsample_initial_channel
+
+    @property
+    def upsample_kernel_sizes(self) -> Tuple[int, ...]:
+        """Forwarded from ``self.audio.upsample_kernel_sizes``."""
+        return self.audio.upsample_kernel_sizes
+
+    def __post_init__(self):
+        """Default ``gin_channels`` to 512 for multi-speaker models left at 0."""
+        if self.is_multispeaker and (self.gin_channels == 0):
+            self.gin_channels = 512
+
+
+@dataclass
+class TrainingConfig:
+    """Training hyper-parameters, each with a default value.
+
+    NOTE(review): how each field is consumed is not visible in this module;
+    ``betas``/``eps`` presumably feed the optimizer and ``c_mel``/``c_kl``
+    presumably weight loss terms. Confirm against the training code.
+    """
+
+    learning_rate: float = 2e-4
+    betas: Tuple[float, float] = field(default=(0.8, 0.99))
+    eps: float = 1e-9
+    # batch_size: int = 32
+    fp16_run: bool = False
+    lr_decay: float = 0.999875
+    init_lr_ratio: float = 1.0
+    warmup_epochs: int = 0
+    c_mel: int = 45
+    c_kl: float = 1.0
+    grad_clip: Optional[float] = None  # None leaves the clip value unset
+
+
+# @dataclass
+# class PhonemesConfig(DataClassJsonMixin):
+#     phoneme_separator: str = " "
+#     """Separator between individual phonemes in CSV input"""
+
+#     word_separator: str = "#"
+#     """Separator between word phonemes in CSV input (must not match phoneme_separator)"""
+
+#     phoneme_to_id: typing.Optional[typing.Dict[str, int]] = None
+#     pad: typing.Optional[str] = "_"
+#     bos: typing.Optional[str] = None
+#     eos: typing.Optional[str] = None
+#     blank: typing.Optional[str] = "#"
+#     blank_word: typing.Optional[str] = None
+#     blank_between: typing.Union[str, BlankBetween] = BlankBetween.WORDS
+#     blank_at_start: bool = True
+#     blank_at_end: bool = True
+#     simple_punctuation: bool = True
+#     punctuation_map: typing.Optional[typing.Dict[str, str]] = None
+#     separate: typing.Optional[typing.List[str]] = None
+#     separate_graphemes: bool = False
+#     separate_tones: bool = False
+#     tone_before: bool = False
+#     phoneme_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None
+#     auto_bos_eos: bool = False
+#     minor_break: typing.Optional[str] = IPA.BREAK_MINOR.value
+#     major_break: typing.Optional[str] = IPA.BREAK_MAJOR.value
+#     break_phonemes_into_graphemes: bool = False
+#     break_phonemes_into_codepoints: bool = False
+#     drop_stress: bool = False
+#     symbols: typing.Optional[typing.List[str]] = None
+
+#     def split_word_phonemes(self, phonemes_str: str) -> typing.List[typing.List[str]]:
+#         """Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)"""
+#         return [
+#             word_phonemes_str.split(self.phoneme_separator)
+#             if self.phoneme_separator
+#             else list(word_phonemes_str)
+#             for word_phonemes_str in phonemes_str.split(self.word_separator)
+#         ]
+
+#     def join_word_phonemes(self, word_phonemes: typing.List[typing.List[str]]) -> str:
+#         """Split phonemes string into a list of lists (outer is words, inner is individual phonemes in each word)"""
+#         return self.word_separator.join(
+#             self.phoneme_separator.join(wp) for wp in word_phonemes
+#         )
+
+
+# class Phonemizer(str, Enum):
+#     SYMBOLS = "symbols"
+#     GRUUT = "gruut"
+#     ESPEAK = "espeak"
+#     EPITRAN = "epitran"
+
+
+# class Aligner(str, Enum):
+#     KALDI_ALIGN = "kaldi_align"
+
+
+# class TextCasing(str, Enum):
+#     LOWER = "lower"
+#     UPPER = "upper"
+
+
+# class MetadataFormat(str, Enum):
+#     TEXT = "text"
+#     PHONEMES = "phonemes"
+#     PHONEME_IDS = "ids"
+
+
+# @dataclass
+# class DatasetConfig:
+#     name: str
+#     metadata_format: MetadataFormat = MetadataFormat.TEXT
+#     multispeaker: bool = False
+#     text_language: typing.Optional[str] = None
+#     audio_dir: typing.Optional[typing.Union[str, Path]] = None
+#     cache_dir: typing.Optional[typing.Union[str, Path]] = None
+
+#     def get_cache_dir(self, output_dir: typing.Union[str, Path]) -> Path:
+#         if self.cache_dir is not None:
+#             cache_dir = Path(self.cache_dir)
+#         else:
+#             cache_dir = Path("cache") / self.name
+
+#         if not cache_dir.is_absolute():
+#             cache_dir = Path(output_dir) / str(cache_dir)
+
+#         return cache_dir
+
+
+# @dataclass
+# class AlignerConfig:
+#     aligner: typing.Optional[Aligner] = None
+#     casing: typing.Optional[TextCasing] = None
+
+
+# @dataclass
+# class InferenceConfig:
+#     length_scale: float = 1.0
+#     noise_scale: float = 0.667
+#     noise_w: float = 0.8
+
+
+# @dataclass
+# class TrainingConfig(DataClassJsonMixin):
+#     seed: int = 1234
+#     epochs: int = 10000
+#     learning_rate: float = 2e-4
+#     betas: typing.Tuple[float, float] = field(default=(0.8, 0.99))
+#     eps: float = 1e-9
+#     batch_size: int = 32
+#     fp16_run: bool = False
+#     lr_decay: float = 0.999875
+#     segment_size: int = 8192
+#     init_lr_ratio: float = 1.0
+#     warmup_epochs: int = 0
+#     c_mel: int = 45
+#     c_kl: float = 1.0
+#     grad_clip: typing.Optional[float] = None
+
+#     min_seq_length: typing.Optional[int] = None
+#     max_seq_length: typing.Optional[int] = None
+
+#     min_spec_length: typing.Optional[int] = None
+#     max_spec_length: typing.Optional[int] = None
+
+#     min_speaker_utterances: typing.Optional[int] = None
+
+#     last_epoch: int = 1
+#     global_step: int = 1
+#     best_loss: typing.Optional[float] = None
+#     audio: AudioConfig = field(default_factory=AudioConfig)
+#     model: ModelConfig = field(default_factory=ModelConfig)
+#     phonemes: PhonemesConfig = field(default_factory=PhonemesConfig)
+#     text_aligner: AlignerConfig = field(default_factory=AlignerConfig)
+#     text_language: typing.Optional[str] = None
+#     phonemizer: typing.Optional[Phonemizer] = None
+#     datasets: typing.List[DatasetConfig] = field(default_factory=list)
+#     inference: InferenceConfig = field(default_factory=InferenceConfig)
+
+#     version: int = 1
+#     git_commit: str = ""
+
+#     @property
+#     def is_multispeaker(self):
+#         return self.model.is_multispeaker or any(d.multispeaker for d in self.datasets)
+
+#     def save(self, config_file: typing.TextIO):
+#         """Save config as JSON to a file"""
+#         json.dump(self.to_dict(), config_file, indent=4)
+
+#     def get_speaker_id(self, dataset_name: str, speaker_name: str) -> int:
+#         if self.speaker_id_map is None:
+#             self.speaker_id_map = {}
+
+#         full_speaker_name = f"{dataset_name}_{speaker_name}"
+#         speaker_id = self.speaker_id_map.get(full_speaker_name)
+#         if speaker_id is None:
+#             speaker_id = len(self.speaker_id_map)
+#             self.speaker_id_map[full_speaker_name] = speaker_id
+
+#         return speaker_id
+
+#     @staticmethod
+#     def load(config_file: typing.TextIO) -> "TrainingConfig":
+#         """Load config from a JSON file"""
+#         return TrainingConfig.from_json(config_file.read())
+
+#     @staticmethod
+#     def load_and_merge(
+#         config: "TrainingConfig",
+#         config_files: typing.Iterable[typing.Union[str, Path, typing.TextIO]],
+#     ) -> "TrainingConfig":
+#         """Loads one or more JSON configuration files and overlays them on top of an existing config"""
+#         base_dict = config.to_dict()
+#         for maybe_config_file in config_files:
+#             if isinstance(maybe_config_file, (str, Path)):
+#                 # File path
+#                 config_file = open(maybe_config_file, "r", encoding="utf-8")
+#             else:
+#                 # File object
+#                 config_file = maybe_config_file
+
+#             with config_file:
+#                 # Load new config and overlay on existing config
+#                 new_dict = json.load(config_file)
+#                 TrainingConfig.recursive_update(base_dict, new_dict)
+
+#         return TrainingConfig.from_dict(base_dict)
+
+#     @staticmethod
+#     def recursive_update(
+#         base_dict: typing.Dict[typing.Any, typing.Any],
+#         new_dict: typing.Mapping[typing.Any, typing.Any],
+#     ) -> None:
+#         """Recursively overwrites values in base dictionary with values from new dictionary"""
+#         for key, value in new_dict.items():
+#             if isinstance(value, collections.Mapping) and (
+#                 base_dict.get(key) is not None
+#             ):
+#                 TrainingConfig.recursive_update(base_dict[key], value)
+#             else:
+#                 base_dict[key] = value
--- a/mlu_370-piper/piper/src/python/piper_train/vits/dataset.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/dataset.py
@@ -0,0 +1,214 @@
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List, Optional, Sequence, Union
+
+import torch
+from torch import FloatTensor, LongTensor
+from torch.utils.data import Dataset
+
+_LOGGER = logging.getLogger("vits.dataset")
+
+
+@dataclass
+class Utterance:
+    """One dataset entry: phoneme ids plus the paths of its saved tensors."""
+
+    # Phoneme id sequence; converted to a LongTensor when the item is fetched
+    phoneme_ids: List[int]
+    # Path handed to torch.load() to obtain the audio tensor
+    audio_norm_path: Path
+    # Path handed to torch.load() to obtain the spectrogram tensor
+    audio_spec_path: Path
+    # Speaker index, or None when the dataset line has no "speaker_id" key
+    speaker_id: Optional[int] = None
+    # Utterance text, or None when the dataset line has no "text" key
+    text: Optional[str] = None
+
+
+@dataclass
+class UtteranceTensors:
+    """Loaded tensors for a single utterance, as built by ``PiperDataset.__getitem__``."""
+
+    phoneme_ids: LongTensor
+    # The collate code reads size(0) as the channel count of the padded batch
+    # and size(1) as the length that gets padded
+    spectrogram: FloatTensor
+    # The collate code reads size(1) as the audio length
+    audio_norm: FloatTensor
+    # Single-element tensor holding the speaker index, or None
+    speaker_id: Optional[LongTensor] = None
+    text: Optional[str] = None
+
+    @property
+    def spec_length(self) -> int:
+        """Return the size of dimension 1 of the spectrogram."""
+        return self.spectrogram.size(1)
+
+
+@dataclass
+class Batch:
+    """Zero-padded batch built by ``UtteranceCollate``, rows sorted by decreasing spectrogram length."""
+
+    # (batch, longest phoneme sequence), zero-padded
+    phoneme_ids: LongTensor
+    # Unpadded phoneme count of each row
+    phoneme_lengths: LongTensor
+    # (batch, spectrogram channels, longest spectrogram), zero-padded
+    spectrograms: FloatTensor
+    # Unpadded spectrogram length of each row
+    spectrogram_lengths: LongTensor
+    # (batch, 1, longest audio), zero-padded; never shorter than the collate segment size
+    audios: FloatTensor
+    # Unpadded audio length of each row
+    audio_lengths: LongTensor
+    # One speaker index per row, or None for single-speaker collation
+    speaker_ids: Optional[LongTensor] = None
+
+
+class PiperDataset(Dataset):
+    """
+    Utterances gathered from one or more dataset files (one JSON object per line).
+
+    Dataset format:
+
+    * phoneme_ids (required)
+    * audio_norm_path (required)
+    * audio_spec_path (required)
+    * text (optional)
+    * phonemes (optional)
+    * audio_path (optional)
+
+    Of these, the loader reads phoneme_ids, audio_norm_path, audio_spec_path,
+    speaker_id and text; other keys are ignored.
+    """
+
+    def __init__(
+        self,
+        dataset_paths: List[Union[str, Path]],
+        max_phoneme_ids: Optional[int] = None,
+    ):
+        """Read every dataset file up front into ``self.utterances``."""
+        self.utterances: List[Utterance] = []
+
+        for raw_path in dataset_paths:
+            path = Path(raw_path)
+            _LOGGER.debug("Loading dataset: %s", path)
+            loaded = PiperDataset.load_dataset(path, max_phoneme_ids=max_phoneme_ids)
+            self.utterances.extend(loaded)
+
+    def __len__(self):
+        return len(self.utterances)
+
+    def __getitem__(self, idx) -> UtteranceTensors:
+        """Load the tensors of utterance ``idx`` from disk."""
+        entry = self.utterances[idx]
+
+        phoneme_ids = LongTensor(entry.phoneme_ids)
+        audio_norm = torch.load(entry.audio_norm_path)
+        spectrogram = torch.load(entry.audio_spec_path)
+
+        speaker_id = None
+        if entry.speaker_id is not None:
+            speaker_id = LongTensor([entry.speaker_id])
+
+        return UtteranceTensors(
+            phoneme_ids=phoneme_ids,
+            audio_norm=audio_norm,
+            spectrogram=spectrogram,
+            speaker_id=speaker_id,
+            text=entry.text,
+        )
+
+    @staticmethod
+    def load_dataset(
+        dataset_path: Path,
+        max_phoneme_ids: Optional[int] = None,
+    ) -> Iterable[Utterance]:
+        """
+        Yield the utterances of one dataset file.
+
+        Blank lines are ignored. A line that fails to load is logged with its
+        traceback and skipped. Utterances with more than ``max_phoneme_ids``
+        phoneme ids are dropped and counted in a final warning.
+        """
+        skipped = 0
+
+        with open(dataset_path, "r", encoding="utf-8") as dataset_file:
+            for line_number, raw_line in enumerate(dataset_file, start=1):
+                line = raw_line.strip()
+                if not line:
+                    continue
+
+                try:
+                    utt = PiperDataset.load_utterance(line)
+                    too_long = (max_phoneme_ids is not None) and (
+                        len(utt.phoneme_ids) > max_phoneme_ids
+                    )
+                    if too_long:
+                        skipped += 1
+                    else:
+                        yield utt
+                except Exception:
+                    _LOGGER.exception(
+                        "Error on line %s of %s: %s",
+                        line_number,
+                        dataset_path,
+                        line,
+                    )
+
+        if skipped > 0:
+            _LOGGER.warning("Skipped %s utterance(s)", skipped)
+
+    @staticmethod
+    def load_utterance(line: str) -> Utterance:
+        """Parse one JSON dataset line into an ``Utterance``."""
+        fields = json.loads(line)
+        phoneme_ids = fields["phoneme_ids"]
+        audio_norm_path = Path(fields["audio_norm_path"])
+        audio_spec_path = Path(fields["audio_spec_path"])
+        return Utterance(
+            phoneme_ids=phoneme_ids,
+            audio_norm_path=audio_norm_path,
+            audio_spec_path=audio_spec_path,
+            speaker_id=fields.get("speaker_id"),
+            text=fields.get("text"),
+        )
+
+
+class UtteranceCollate:
+    """Collate callable that zero-pads a sequence of utterances into one ``Batch``."""
+
+    def __init__(self, is_multispeaker: bool, segment_size: int):
+        self.is_multispeaker = is_multispeaker
+        self.segment_size = segment_size
+
+    def __call__(self, utterances: Sequence[UtteranceTensors]) -> Batch:
+        """Pad ``utterances`` to common sizes, ordered by decreasing spectrogram length."""
+        batch_len = len(utterances)
+        assert batch_len > 0, "No utterances"
+
+        longest_phonemes = 0
+        longest_spec = 0
+        longest_audio = 0
+        num_mels = 0
+
+        # First pass: validate the items and find the padded sizes
+        for utt in utterances:
+            assert utt.spectrogram is not None
+            assert utt.audio_norm is not None
+
+            longest_phonemes = max(longest_phonemes, utt.phoneme_ids.size(0))
+            longest_spec = max(longest_spec, utt.spectrogram.size(1))
+            longest_audio = max(longest_audio, utt.audio_norm.size(1))
+
+            num_mels = utt.spectrogram.size(0)
+            if self.is_multispeaker:
+                assert utt.speaker_id is not None, "Missing speaker id"
+
+        # Padded audio is never shorter than the segment size
+        longest_audio = max(longest_audio, self.segment_size)
+
+        # Zero-filled containers for the padded data
+        phonemes_padded = LongTensor(batch_len, longest_phonemes).zero_()
+        spec_padded = FloatTensor(batch_len, num_mels, longest_spec).zero_()
+        audio_padded = FloatTensor(batch_len, 1, longest_audio).zero_()
+
+        # Every entry of these is assigned in the loop below
+        phoneme_lengths = LongTensor(batch_len)
+        spec_lengths = LongTensor(batch_len)
+        audio_lengths = LongTensor(batch_len)
+
+        speaker_ids: Optional[LongTensor] = (
+            LongTensor(batch_len) if self.is_multispeaker else None
+        )
+
+        # Second pass: copy each utterance into its row, longest spectrogram first
+        by_spec_length = sorted(
+            utterances, key=lambda item: item.spectrogram.size(1), reverse=True
+        )
+        for row, utt in enumerate(by_spec_length):
+            n_phonemes = utt.phoneme_ids.size(0)
+            n_spec = utt.spectrogram.size(1)
+            n_audio = utt.audio_norm.size(1)
+
+            phonemes_padded[row, :n_phonemes] = utt.phoneme_ids
+            phoneme_lengths[row] = n_phonemes
+
+            spec_padded[row, :, :n_spec] = utt.spectrogram
+            spec_lengths[row] = n_spec
+
+            audio_padded[row, :, :n_audio] = utt.audio_norm
+            audio_lengths[row] = n_audio
+
+            if utt.speaker_id is not None:
+                assert speaker_ids is not None
+                speaker_ids[row] = utt.speaker_id
+
+        return Batch(
+            phoneme_ids=phonemes_padded,
+            phoneme_lengths=phoneme_lengths,
+            spectrograms=spec_padded,
+            spectrogram_lengths=spec_lengths,
+            audios=audio_padded,
+            audio_lengths=audio_lengths,
+            speaker_ids=speaker_ids,
+        )
--- a/mlu_370-piper/piper/src/python/piper_train/vits/lightning.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/lightning.py
@@ -0,0 +1,352 @@
+import logging
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+
+import pytorch_lightning as pl
+import torch
+from torch import autocast
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, Dataset, random_split
+
+from .commons import slice_segments
+from .dataset import Batch, PiperDataset, UtteranceCollate
+from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss
+from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch
+from .models import MultiPeriodDiscriminator, SynthesizerTrn
+
+_LOGGER = logging.getLogger("vits.lightning")
+
+
+class VitsModel(pl.LightningModule):
+    def __init__(
+        self,
+        num_symbols: int,
+        num_speakers: int,
+        # audio
+        resblock="2",
+        resblock_kernel_sizes=(3, 5, 7),
+        resblock_dilation_sizes=(
+            (1, 2),
+            (2, 6),
+            (3, 12),
+        ),
+        upsample_rates=(8, 8, 4),
+        upsample_initial_channel=256,
+        upsample_kernel_sizes=(16, 16, 8),
+        # mel
+        filter_length: int = 1024,
+        hop_length: int = 256,
+        win_length: int = 1024,
+        mel_channels: int = 80,
+        sample_rate: int = 22050,
+        sample_bytes: int = 2,
+        channels: int = 1,
+        mel_fmin: float = 0.0,
+        mel_fmax: Optional[float] = None,
+        # model
+        inter_channels: int = 192,
+        hidden_channels: int = 192,
+        filter_channels: int = 768,
+        n_heads: int = 2,
+        n_layers: int = 6,
+        kernel_size: int = 3,
+        p_dropout: float = 0.1,
+        n_layers_q: int = 3,
+        use_spectral_norm: bool = False,
+        gin_channels: int = 0,
+        use_sdp: bool = True,
+        segment_size: int = 8192,
+        # training
+        dataset: Optional[List[Union[str, Path]]] = None,
+        learning_rate: float = 2e-4,
+        betas: Tuple[float, float] = (0.8, 0.99),
+        eps: float = 1e-9,
+        batch_size: int = 1,
+        lr_decay: float = 0.999875,
+        init_lr_ratio: float = 1.0,
+        warmup_epochs: int = 0,
+        c_mel: int = 45,
+        c_kl: float = 1.0,
+        grad_clip: Optional[float] = None,
+        num_workers: int = 1,
+        seed: int = 1234,
+        num_test_examples: int = 5,
+        validation_split: float = 0.1,
+        max_phoneme_ids: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Build the generator and discriminator and, when given, load the dataset.
+
+        The constructor arguments are recorded with ``save_hyperparameters()``
+        and read back through ``self.hparams`` here and in the other methods.
+
+        Handling that is not obvious from the names:
+
+        * ``num_symbols`` is passed to the generator as ``n_vocab`` and
+          ``num_speakers`` as ``n_speakers``.
+        * With ``num_speakers > 1`` and ``gin_channels <= 0``, ``gin_channels``
+          is replaced by 512.
+        * The generator gets ``spec_channels = filter_length // 2 + 1`` and
+          ``segment_size // hop_length`` as its segment size.
+        * ``dataset`` is a list of dataset file paths; ``None`` leaves the
+          train/validation/test splits unset.
+        * ``validation_split``, ``num_test_examples`` and ``max_phoneme_ids``
+          are forwarded to ``_load_datasets``.
+        """
+        super().__init__()
+        self.save_hyperparameters()
+
+        if (self.hparams.num_speakers > 1) and (self.hparams.gin_channels <= 0):
+            # Default gin_channels for multi-speaker model
+            self.hparams.gin_channels = 512
+
+        # Set up models
+        self.model_g = SynthesizerTrn(
+            n_vocab=self.hparams.num_symbols,
+            spec_channels=self.hparams.filter_length // 2 + 1,
+            segment_size=self.hparams.segment_size // self.hparams.hop_length,
+            inter_channels=self.hparams.inter_channels,
+            hidden_channels=self.hparams.hidden_channels,
+            filter_channels=self.hparams.filter_channels,
+            n_heads=self.hparams.n_heads,
+            n_layers=self.hparams.n_layers,
+            kernel_size=self.hparams.kernel_size,
+            p_dropout=self.hparams.p_dropout,
+            resblock=self.hparams.resblock,
+            resblock_kernel_sizes=self.hparams.resblock_kernel_sizes,
+            resblock_dilation_sizes=self.hparams.resblock_dilation_sizes,
+            upsample_rates=self.hparams.upsample_rates,
+            upsample_initial_channel=self.hparams.upsample_initial_channel,
+            upsample_kernel_sizes=self.hparams.upsample_kernel_sizes,
+            n_speakers=self.hparams.num_speakers,
+            gin_channels=self.hparams.gin_channels,
+            use_sdp=self.hparams.use_sdp,
+        )
+        self.model_d = MultiPeriodDiscriminator(
+            use_spectral_norm=self.hparams.use_spectral_norm
+        )
+
+        # Dataset splits
+        self._train_dataset: Optional[Dataset] = None
+        self._val_dataset: Optional[Dataset] = None
+        self._test_dataset: Optional[Dataset] = None
+        self._load_datasets(validation_split, num_test_examples, max_phoneme_ids)
+
+        # State kept between training optimizers
+        # (written by training_step_g, read by training_step_d)
+        self._y = None
+        self._y_hat = None
+
+    def _load_datasets(
+        self,
+        validation_split: float,
+        num_test_examples: int,
+        max_phoneme_ids: Optional[int] = None,
+    ):
+        """
+        Load the dataset files and split them into train/test/validation sets.
+
+        Does nothing when ``self.hparams.dataset`` is ``None``.
+
+        Args:
+            validation_split: fraction of the utterances used for validation
+                (the count is truncated to an int).
+            num_test_examples: number of utterances held out as test examples.
+            max_phoneme_ids: utterances with more phoneme ids are excluded.
+
+        Raises:
+            ValueError: if any of the resulting split sizes is negative, e.g.
+                the dataset has fewer utterances than the validation and test
+                sets require.
+        """
+        if self.hparams.dataset is None:
+            _LOGGER.debug("No dataset to load")
+            return
+
+        full_dataset = PiperDataset(
+            self.hparams.dataset, max_phoneme_ids=max_phoneme_ids
+        )
+        total_size = len(full_dataset)
+        valid_set_size = int(total_size * validation_split)
+        train_set_size = total_size - valid_set_size - num_test_examples
+
+        # The three sizes always sum to the dataset length, which is the check
+        # random_split performs, so a negative size would not necessarily be
+        # rejected there and could yield overlapping subsets. Fail early.
+        if min(train_set_size, valid_set_size, num_test_examples) < 0:
+            raise ValueError(
+                f"Cannot split {total_size} utterance(s) into "
+                f"train={train_set_size}, test={num_test_examples}, "
+                f"validation={valid_set_size}"
+            )
+
+        self._train_dataset, self._test_dataset, self._val_dataset = random_split(
+            full_dataset, [train_set_size, num_test_examples, valid_set_size]
+        )
+
+    def forward(self, text, text_lengths, scales, sid=None):
+        """
+        Run generator inference and return only the audio output.
+
+        ``scales`` is indexed as ``[noise_scale, length_scale, noise_scale_w]``.
+        """
+        # The first value returned by infer() is the audio; the rest is dropped
+        audio, *_unused = self.model_g.infer(
+            text,
+            text_lengths,
+            noise_scale=scales[0],
+            length_scale=scales[1],
+            noise_scale_w=scales[2],
+            sid=sid,
+        )
+        return audio
+
+    def train_dataloader(self):
+        """Return the DataLoader over the training split."""
+        hparams = self.hparams
+        collate = UtteranceCollate(
+            is_multispeaker=hparams.num_speakers > 1,
+            segment_size=hparams.segment_size,
+        )
+        return DataLoader(
+            self._train_dataset,
+            collate_fn=collate,
+            num_workers=hparams.num_workers,
+            batch_size=hparams.batch_size,
+        )
+
+    def val_dataloader(self):
+        """Return the DataLoader over the validation split."""
+        hparams = self.hparams
+        collate = UtteranceCollate(
+            is_multispeaker=hparams.num_speakers > 1,
+            segment_size=hparams.segment_size,
+        )
+        return DataLoader(
+            self._val_dataset,
+            collate_fn=collate,
+            num_workers=hparams.num_workers,
+            batch_size=hparams.batch_size,
+        )
+
+    def test_dataloader(self):
+        """Return the DataLoader over the test split."""
+        hparams = self.hparams
+        collate = UtteranceCollate(
+            is_multispeaker=hparams.num_speakers > 1,
+            segment_size=hparams.segment_size,
+        )
+        return DataLoader(
+            self._test_dataset,
+            collate_fn=collate,
+            num_workers=hparams.num_workers,
+            batch_size=hparams.batch_size,
+        )
+
+    def training_step(self, batch: Batch, batch_idx: int, optimizer_idx: int):
+        """Run the generator step for optimizer 0 and the discriminator step for optimizer 1."""
+        if optimizer_idx == 0:
+            step = self.training_step_g
+        elif optimizer_idx == 1:
+            step = self.training_step_d
+        else:
+            # Any other optimizer index produces no loss
+            return None
+
+        return step(batch)
+
+    def training_step_g(self, batch: Batch):
+        """
+        Generator step: run the generator on ``batch`` and return its total loss.
+
+        The total is the sum of the adversarial, feature-matching, mel L1
+        (scaled by ``c_mel``), duration and KL (scaled by ``c_kl``) losses and
+        is logged as ``loss_gen_all``. The generated audio and the matching
+        slice of real audio are stored on ``self`` for ``training_step_d``.
+        """
+        x, x_lengths, y, _, spec, spec_lengths, speaker_ids = (
+            batch.phoneme_ids,
+            batch.phoneme_lengths,
+            batch.audios,
+            batch.audio_lengths,
+            batch.spectrograms,
+            batch.spectrogram_lengths,
+            batch.speaker_ids if batch.speaker_ids is not None else None,
+        )
+        (
+            y_hat,
+            l_length,
+            _attn,
+            ids_slice,
+            _x_mask,
+            z_mask,
+            (_z, z_p, m_p, logs_p, _m_q, logs_q),
+        ) = self.model_g(x, x_lengths, spec, spec_lengths, speaker_ids)
+        # Save for training_step_d
+        self._y_hat = y_hat
+
+        # Mel spectrogram of the real input, cut at the generator's slice positions
+        mel = spec_to_mel_torch(
+            spec,
+            self.hparams.filter_length,
+            self.hparams.mel_channels,
+            self.hparams.sample_rate,
+            self.hparams.mel_fmin,
+            self.hparams.mel_fmax,
+        )
+        y_mel = slice_segments(
+            mel,
+            ids_slice,
+            self.hparams.segment_size // self.hparams.hop_length,
+        )
+        # Mel spectrogram of the generated audio
+        y_hat_mel = mel_spectrogram_torch(
+            y_hat.squeeze(1),
+            self.hparams.filter_length,
+            self.hparams.mel_channels,
+            self.hparams.sample_rate,
+            self.hparams.hop_length,
+            self.hparams.win_length,
+            self.hparams.mel_fmin,
+            self.hparams.mel_fmax,
+        )
+        # Same slice of the real audio: positions are scaled by hop_length
+        # and the length is segment_size
+        y = slice_segments(
+            y,
+            ids_slice * self.hparams.hop_length,
+            self.hparams.segment_size,
+        )  # slice
+
+        # Save for training_step_d
+        self._y = y
+
+        _y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = self.model_d(y, y_hat)
+
+        # Losses are computed with autocast disabled
+        with autocast(self.device.type, enabled=False):
+            # Generator loss
+            loss_dur = torch.sum(l_length.float())
+            loss_mel = F.l1_loss(y_mel, y_hat_mel) * self.hparams.c_mel
+            loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * self.hparams.c_kl
+
+            loss_fm = feature_loss(fmap_r, fmap_g)
+            loss_gen, _losses_gen = generator_loss(y_d_hat_g)
+            loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl
+
+            self.log("loss_gen_all", loss_gen_all)
+
+            return loss_gen_all
+
+    def training_step_d(self, batch: Batch):
+        """
+        Discriminator step: score the audio saved by ``training_step_g``.
+
+        ``batch`` itself is not read; the real and generated audio come from
+        ``self._y`` and ``self._y_hat``. The loss is logged as ``loss_disc_all``.
+        """
+        real_audio = self._y
+        generated_audio = self._y_hat
+        # Detach so this loss does not backpropagate into the generator
+        scores_real, scores_generated, _, _ = self.model_d(
+            real_audio, generated_audio.detach()
+        )
+
+        with autocast(self.device.type, enabled=False):
+            loss_disc_all, _losses_disc_r, _losses_disc_g = discriminator_loss(
+                scores_real, scores_generated
+            )
+
+            self.log("loss_disc_all", loss_disc_all)
+
+            return loss_disc_all
+
+    def validation_step(self, batch: Batch, batch_idx: int):
+        """
+        Compute the validation loss and log synthesized test utterances.
+
+        The loss is the sum of the generator and discriminator step losses on
+        ``batch`` and is logged as ``val_loss``. Every utterance of the test
+        split is then synthesized, peak-normalized and added to the experiment
+        logger as audio, tagged with its text or, failing that, its index.
+
+        Returns:
+            The validation loss.
+        """
+        val_loss = self.training_step_g(batch) + self.training_step_d(batch)
+        self.log("val_loss", val_loss)
+
+        # Generate audio examples
+        for utt_idx, test_utt in enumerate(self._test_dataset):
+            text = test_utt.phoneme_ids.unsqueeze(0).to(self.device)
+            text_lengths = torch.LongTensor([len(test_utt.phoneme_ids)]).to(self.device)
+            scales = [0.667, 1.0, 0.8]
+            sid = (
+                test_utt.speaker_id.to(self.device)
+                if test_utt.speaker_id is not None
+                else None
+            )
+            test_audio = self(text, text_lengths, scales, sid=sid).detach()
+
+            # Scale to make louder in [-1, 1]. The magnitude must be taken
+            # before the max: abs(max(x)) misses a negative peak and can push
+            # samples far outside the range. The 0.01 floor caps the gain at
+            # 100x for near-silent output.
+            peak = test_audio.abs().max().clamp(min=0.01)
+            test_audio = test_audio * (1.0 / peak)
+
+            tag = test_utt.text or str(utt_idx)
+            self.logger.experiment.add_audio(
+                tag, test_audio, sample_rate=self.hparams.sample_rate
+            )
+
+        return val_loss
+
+    def configure_optimizers(self):
+        """
+        Create one AdamW optimizer and one exponential LR scheduler per model.
+
+        Index 0 belongs to the generator and index 1 to the discriminator, in
+        both returned lists.
+        """
+        hparams = self.hparams
+
+        def build_optimizer(module):
+            return torch.optim.AdamW(
+                module.parameters(),
+                lr=hparams.learning_rate,
+                betas=hparams.betas,
+                eps=hparams.eps,
+            )
+
+        optimizers = [build_optimizer(self.model_g), build_optimizer(self.model_d)]
+        schedulers = [
+            torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=hparams.lr_decay)
+            for optimizer in optimizers
+        ]
+
+        return optimizers, schedulers
+
+    @staticmethod
+    def add_model_specific_args(parent_parser):
+        """Register the VitsModel command-line options and return ``parent_parser``."""
+        group = parent_parser.add_argument_group("VitsModel")
+
+        # Training and dataset options
+        group.add_argument("--batch-size", type=int, required=True)
+        group.add_argument("--validation-split", type=float, default=0.1)
+        group.add_argument("--num-test-examples", type=int, default=5)
+        group.add_argument(
+            "--max-phoneme-ids",
+            type=int,
+            help="Exclude utterances with phoneme id lists longer than this",
+        )
+
+        # Model size options: all integers with a default
+        int_options = (
+            ("--hidden-channels", 192),
+            ("--inter-channels", 192),
+            ("--filter-channels", 768),
+            ("--n-layers", 6),
+            ("--n-heads", 2),
+        )
+        for flag, default in int_options:
+            group.add_argument(flag, type=int, default=default)
+
+        return parent_parser
--- a/mlu_370-piper/piper/src/python/piper_train/vits/losses.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/losses.py
@@ -0,0 +1,58 @@
+import torch
+
+
+def feature_loss(fmap_r, fmap_g):
+    """
+    Feature-matching loss between real and generated feature maps.
+
+    Both arguments are iterated two levels deep in lockstep. The result is
+    twice the sum of the mean absolute differences, with the real maps
+    detached from the graph.
+    """
+    total = 0
+    for real_maps, generated_maps in zip(fmap_r, fmap_g):
+        for real, generated in zip(real_maps, generated_maps):
+            difference = real.float().detach() - generated.float()
+            total = total + torch.mean(torch.abs(difference))
+
+    return total * 2
+
+
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    """
+    Least-squares discriminator loss.
+
+    For each pair of outputs, real scores are pushed towards 1 and generated
+    scores towards 0.
+
+    Returns:
+        ``(loss, r_losses, g_losses)``: the summed loss, and the per-pair real
+        and generated terms as Python floats.
+    """
+    total = 0
+    real_terms = []
+    generated_terms = []
+    for real_out, generated_out in zip(disc_real_outputs, disc_generated_outputs):
+        real_term = torch.mean((1 - real_out.float()) ** 2)
+        generated_term = torch.mean(generated_out.float() ** 2)
+        total = total + (real_term + generated_term)
+        real_terms.append(real_term.item())
+        generated_terms.append(generated_term.item())
+
+    return total, real_terms, generated_terms
+
+
+def generator_loss(disc_outputs):
+    """
+    Least-squares generator loss: generated scores are pushed towards 1.
+
+    Returns:
+        ``(loss, gen_losses)``: the summed loss and the per-output terms
+        (kept as tensors).
+    """
+    terms = [torch.mean((1 - scores.float()) ** 2) for scores in disc_outputs]
+
+    total = 0
+    for term in terms:
+        total = total + term
+
+    return total, terms
+
+
+def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
+    """
+    Masked KL term, averaged over the sum of the mask.
+
+    z_p, logs_q: [b, h, t_t]
+    m_p, logs_p: [b, h, t_t]
+    """
+    # Everything is computed in float32
+    z_p, logs_q, m_p, logs_p, z_mask = (
+        tensor.float() for tensor in (z_p, logs_q, m_p, logs_p, z_mask)
+    )
+
+    log_term = logs_p - logs_q - 0.5
+    squared_term = 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
+    masked_total = torch.sum((log_term + squared_term) * z_mask)
+
+    return masked_total / torch.sum(z_mask)
--- a/mlu_370-piper/piper/src/python/piper_train/vits/mel_processing.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/mel_processing.py
@@ -0,0 +1,139 @@
+import torch
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+
+MAX_WAV_VALUE = 32768.0
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    """
+    Log-compress ``x`` after clamping it from below.
+
+    PARAMS
+    ------
+    C: compression factor
+    clip_val: lower bound applied to x before the log
+    """
+    floored = torch.clamp(x, min=clip_val)
+    return torch.log(floored * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+    """
+    Undo the log compression: exponentiate ``x`` and divide by ``C``.
+
+    PARAMS
+    ------
+    C: compression factor used to compress
+    """
+    expanded = torch.exp(x)
+    return expanded / C
+
+
+def spectral_normalize_torch(magnitudes):
+    """Apply dynamic range compression with its default settings."""
+    return dynamic_range_compression_torch(magnitudes)
+
+
+def spectral_de_normalize_torch(magnitudes):
+    """Apply dynamic range decompression with its default settings."""
+    return dynamic_range_decompression_torch(magnitudes)
+
+
+mel_basis = {}
+hann_window = {}
+
+
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+    """
+    Magnitude spectrogram of ``y`` computed with ``torch.stft``.
+
+    Values outside [-1, 1] are reported with a print but not changed. The
+    input is reflect-padded by ``(n_fft - hop_size) / 2`` on both sides, and
+    the Hann window is cached per (win_size, dtype, device).
+    ``sampling_rate`` is accepted but not used.
+    """
+    lowest = torch.min(y)
+    if lowest < -1.0:
+        print("min value is ", lowest)
+    highest = torch.max(y)
+    if highest > 1.0:
+        print("max value is ", highest)
+
+    global hann_window
+    window_key = "_".join((str(win_size), str(y.dtype), str(y.device)))
+    if window_key not in hann_window:
+        hann_window[window_key] = torch.hann_window(win_size).type_as(y)
+    window = hann_window[window_key]
+
+    pad = int((n_fft - hop_size) / 2)
+    padded = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
+    y = padded.squeeze(1)
+
+    stft = torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=window,
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        onesided=True,
+        return_complex=True,
+    )
+
+    # Magnitude from real/imaginary parts; the epsilon keeps sqrt away from zero
+    real_imag = torch.view_as_real(stft)
+    return torch.sqrt(real_imag.pow(2).sum(-1) + 1e-6)
+
+
+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
+    """
+    Project a linear spectrogram onto a mel filterbank and log-compress it.
+
+    The filterbank is built with ``librosa.filters.mel`` and cached in the
+    module-level ``mel_basis`` dict, converted to the dtype and device of
+    ``spec``.
+
+    Args:
+        spec: linear spectrogram; it is left-multiplied by the filterbank.
+        n_fft, num_mels, sampling_rate, fmin, fmax: filterbank parameters,
+            passed to librosa as ``n_fft``, ``n_mels``, ``sr``, ``fmin``, ``fmax``.
+
+    Returns:
+        The mel spectrogram after ``spectral_normalize_torch``.
+    """
+    global mel_basis
+    # Key the cache on every argument that shapes the filterbank, not only
+    # fmax, so a call with a different sample rate, FFT size, mel count or
+    # fmin never reuses a basis built for other settings.
+    basis_key = "_".join(
+        str(part)
+        for part in (
+            sampling_rate,
+            n_fft,
+            num_mels,
+            fmin,
+            fmax,
+            spec.dtype,
+            spec.device,
+        )
+    )
+    if basis_key not in mel_basis:
+        mel = librosa_mel_fn(
+            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
+        )
+        mel_basis[basis_key] = torch.from_numpy(mel).type_as(spec)
+    spec = torch.matmul(mel_basis[basis_key], spec)
+    spec = spectral_normalize_torch(spec)
+    return spec
+
+
+def mel_spectrogram_torch(
+    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
+):
+    """
+    Log-compressed mel spectrogram of ``y``.
+
+    Values outside [-1, 1] are reported with a print but not changed. The
+    input is reflect-padded by ``(n_fft - hop_size) / 2`` on both sides,
+    transformed with ``torch.stft`` using a cached Hann window, converted to
+    magnitudes, projected onto a cached ``librosa.filters.mel`` filterbank and
+    passed through ``spectral_normalize_torch``.
+
+    Args:
+        y: input signal; it gains a channel axis for padding, which is removed
+            again before the STFT.
+        n_fft, hop_size, win_size, center: STFT parameters.
+        num_mels, sampling_rate, fmin, fmax: filterbank parameters (with ``n_fft``).
+
+    Returns:
+        The mel spectrogram tensor.
+    """
+    if torch.min(y) < -1.0:
+        print("min value is ", torch.min(y))
+    if torch.max(y) > 1.0:
+        print("max value is ", torch.max(y))
+
+    global mel_basis, hann_window
+    dtype_device = str(y.dtype) + "_" + str(y.device)
+    # Key the filterbank cache on every argument that shapes it, not only
+    # fmax, so a call with a different sample rate, FFT size, mel count or
+    # fmin never reuses a basis built for other settings.
+    basis_key = "_".join(
+        str(part)
+        for part in (
+            sampling_rate,
+            n_fft,
+            num_mels,
+            fmin,
+            fmax,
+            y.dtype,
+            y.device,
+        )
+    )
+    wnsize_dtype_device = str(win_size) + "_" + dtype_device
+    if basis_key not in mel_basis:
+        mel = librosa_mel_fn(
+            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
+        )
+        mel_basis[basis_key] = torch.from_numpy(mel).type_as(y)
+    if wnsize_dtype_device not in hann_window:
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).type_as(y)
+
+    pad = int((n_fft - hop_size) / 2)
+    y = torch.nn.functional.pad(
+        y.unsqueeze(1),
+        (pad, pad),
+        mode="reflect",
+    )
+    y = y.squeeze(1)
+    spec = torch.view_as_real(
+        torch.stft(
+            y,
+            n_fft,
+            hop_length=hop_size,
+            win_length=win_size,
+            window=hann_window[wnsize_dtype_device],
+            center=center,
+            pad_mode="reflect",
+            normalized=False,
+            onesided=True,
+            return_complex=True,
+        )
+    )
+
+    # Magnitude from real/imaginary parts; the epsilon keeps sqrt away from zero
+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+
+    spec = torch.matmul(mel_basis[basis_key], spec)
+    spec = spectral_normalize_torch(spec)
+
+    return spec
--- a/mlu_370-piper/piper/src/python/piper_train/vits/models.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/models.py
@@ -0,0 +1,732 @@
+import math
+import typing
+
+import torch
+from torch import nn
+from torch.nn import Conv1d, Conv2d, ConvTranspose1d
+from torch.nn import functional as F
+from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+
+from . import attentions, commons, modules, monotonic_align
+from .commons import get_padding, init_weights
+
+
+class StochasticDurationPredictor(nn.Module):
+    """Flow-based duration predictor.
+
+    With ``reverse=False`` and durations ``w`` the forward pass returns a
+    per-batch-item loss term (``nll + logq``). With ``reverse=True`` it samples
+    noise scaled by ``noise_scale`` and runs the flows backwards to produce
+    ``logw``.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        filter_channels: int,
+        kernel_size: int,
+        p_dropout: float,
+        n_flows: int = 4,
+        gin_channels: int = 0,
+    ):
+        super().__init__()
+        # The filter_channels argument is overridden by in_channels; upstream
+        # marks this as something to be removed in a future version.
+        filter_channels = in_channels
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        def build_flow_stack(count):
+            # One ElementwiseAffine followed by `count` (ConvFlow, Flip) pairs.
+            stack = [modules.ElementwiseAffine(2)]
+            for _ in range(count):
+                stack.append(
+                    modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
+                )
+                stack.append(modules.Flip())
+            return nn.ModuleList(stack)
+
+        self.log_flow = modules.Log()
+        self.flows = build_flow_stack(n_flows)
+
+        # Posterior branch (conditions on the durations `w`).
+        self.post_pre = nn.Conv1d(1, filter_channels, 1)
+        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
+        self.post_convs = modules.DDSConv(
+            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
+        )
+        self.post_flows = build_flow_stack(4)
+
+        # Conditioning branch applied to the input `x`.
+        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
+        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
+        self.convs = modules.DDSConv(
+            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
+        )
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
+
+    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
+        """Return the loss term (``reverse=False``) or sampled ``logw`` (``reverse=True``).
+
+        ``x`` and ``g`` are detached, so no gradient flows back into whatever
+        produced them. ``w`` is required when ``reverse`` is False.
+        """
+        x = self.pre(torch.detach(x))
+        if g is not None:
+            x = x + self.cond(torch.detach(g))
+        x = self.convs(x, x_mask)
+        x = self.proj(x) * x_mask
+
+        if reverse:
+            inverse_flows = list(reversed(self.flows))
+            # remove a useless vflow
+            inverse_flows = inverse_flows[:-2] + [inverse_flows[-1]]
+            z = torch.randn(x.size(0), 2, x.size(2)).type_as(x) * noise_scale
+
+            for flow in inverse_flows:
+                z = flow(z, x_mask, g=x, reverse=reverse)
+            logw, _ = torch.split(z, [1, 1], 1)
+            return logw
+
+        assert w is not None
+
+        # Posterior flows over noise e_q, conditioned on x and the encoded w.
+        logdet_tot_q = 0
+        h_w = self.post_pre(w)
+        h_w = self.post_convs(h_w, x_mask)
+        h_w = self.post_proj(h_w) * x_mask
+        e_q = torch.randn(w.size(0), 2, w.size(2)).type_as(x) * x_mask
+        z_q = e_q
+        for post_flow in self.post_flows:
+            z_q, logdet_q = post_flow(z_q, x_mask, g=(x + h_w))
+            logdet_tot_q += logdet_q
+        z_u, z1 = torch.split(z_q, [1, 1], 1)
+        u = torch.sigmoid(z_u) * x_mask
+        z0 = (w - u) * x_mask
+        logdet_tot_q += torch.sum(
+            (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
+        )
+        gaussian_q = torch.sum(
+            -0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]
+        )
+        logq = gaussian_q - logdet_tot_q
+
+        # Main flows over [z0, z1], conditioned on x.
+        logdet_tot = 0
+        z0, logdet = self.log_flow(z0, x_mask)
+        logdet_tot += logdet
+        z = torch.cat([z0, z1], 1)
+        for flow in self.flows:
+            z, logdet = flow(z, x_mask, g=x, reverse=reverse)
+            logdet_tot = logdet_tot + logdet
+        gaussian_p = torch.sum(
+            0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]
+        )
+        nll = gaussian_p - logdet_tot
+        return nll + logq  # [b]
+
+
+class DurationPredictor(nn.Module):
+    """Convolutional duration predictor.
+
+    Two (conv -> ReLU -> LayerNorm -> dropout) stages followed by a 1x1
+    projection down to a single channel. An optional conditioning input ``g``
+    is projected to ``in_channels`` and added to ``x`` first.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        filter_channels: int,
+        kernel_size: int,
+        p_dropout: float,
+        gin_channels: int = 0,
+    ):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.gin_channels = gin_channels
+
+        same_padding = kernel_size // 2
+
+        self.drop = nn.Dropout(p_dropout)
+        self.conv_1 = nn.Conv1d(
+            in_channels, filter_channels, kernel_size, padding=same_padding
+        )
+        self.norm_1 = modules.LayerNorm(filter_channels)
+        self.conv_2 = nn.Conv1d(
+            filter_channels, filter_channels, kernel_size, padding=same_padding
+        )
+        self.norm_2 = modules.LayerNorm(filter_channels)
+        self.proj = nn.Conv1d(filter_channels, 1, 1)
+
+        # The conditioning projection only exists when gin_channels is non-zero.
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, in_channels, 1)
+
+    def forward(self, x, x_mask, g=None):
+        """Return the masked single-channel prediction for ``x``.
+
+        ``x`` and ``g`` are detached before use, so no gradient reaches the
+        modules that produced them.
+        """
+        x = torch.detach(x)
+        if g is not None:
+            x = x + self.cond(torch.detach(g))
+
+        x = self.drop(self.norm_1(torch.relu(self.conv_1(x * x_mask))))
+        x = self.drop(self.norm_2(torch.relu(self.conv_2(x * x_mask))))
+
+        return self.proj(x * x_mask) * x_mask
+
+
+class TextEncoder(nn.Module):
+    """Embed token ids, encode them and project to prior statistics.
+
+    ``forward`` returns the encoder output, the two halves of the projection
+    (``m`` and ``logs``, ``out_channels`` each) and the sequence mask.
+    """
+
+    def __init__(
+        self,
+        n_vocab: int,
+        out_channels: int,
+        hidden_channels: int,
+        filter_channels: int,
+        n_heads: int,
+        n_layers: int,
+        kernel_size: int,
+        p_dropout: float,
+    ):
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+
+        # Embedding weights start from N(0, hidden_channels ** -0.5).
+        self.emb = nn.Embedding(n_vocab, hidden_channels)
+        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
+
+        self.encoder = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        # Twice out_channels: split into (m, logs) in forward().
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths):
+        """Encode ids ``x`` with valid lengths ``x_lengths``.
+
+        Returns ``(encoded, m, logs, x_mask)``.
+        """
+        embedded = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
+        hidden = torch.transpose(embedded, 1, -1)  # [b, h, t]
+
+        valid = commons.sequence_mask(x_lengths, hidden.size(2))
+        x_mask = torch.unsqueeze(valid, 1).type_as(hidden)
+
+        hidden = self.encoder(hidden * x_mask, x_mask)
+        stats = self.proj(hidden) * x_mask
+
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return hidden, m, logs, x_mask
+
+
+class ResidualCouplingBlock(nn.Module):
+    """A chain of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.
+
+    The chain is applied front-to-back when ``reverse`` is False and
+    back-to-front when it is True.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        hidden_channels: int,
+        kernel_size: int,
+        dilation_rate: int,
+        n_layers: int,
+        n_flows: int = 4,
+        gin_channels: int = 0,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            coupling = modules.ResidualCouplingLayer(
+                channels,
+                hidden_channels,
+                kernel_size,
+                dilation_rate,
+                n_layers,
+                gin_channels=gin_channels,
+                mean_only=True,
+            )
+            self.flows.append(coupling)
+            self.flows.append(modules.Flip())
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        """Run ``x`` through the flow chain in the requested direction."""
+        if reverse:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+            return x
+
+        # Forward direction: each flow also returns a second value, unused here.
+        for flow in self.flows:
+            x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+
+
+class PosteriorEncoder(nn.Module):
+    """Encode an input sequence into posterior statistics and a sample.
+
+    The input goes through a 1x1 convolution, a ``modules.WN`` stack and a
+    1x1 projection whose output is split into ``m`` and ``logs``. A latent
+    ``z`` is then drawn as ``m + noise * exp(logs)``.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        hidden_channels: int,
+        kernel_size: int,
+        dilation_rate: int,
+        n_layers: int,
+        gin_channels: int = 0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            gin_channels=gin_channels,
+        )
+        # Twice out_channels: split into (m, logs) in forward().
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths, g=None):
+        """Return ``(z, m, logs, x_mask)`` for input ``x`` with lengths ``x_lengths``."""
+        valid = commons.sequence_mask(x_lengths, x.size(2))
+        x_mask = torch.unsqueeze(valid, 1).type_as(x)
+
+        hidden = self.pre(x) * x_mask
+        hidden = self.enc(hidden, x_mask, g=g)
+        stats = self.proj(hidden) * x_mask
+
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        noise = torch.randn_like(m)
+        z = (m + noise * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+
+
+class Generator(torch.nn.Module):
+    """Decoder that upsamples its input through transposed convolutions.
+
+    Every upsampling stage is followed by ``num_kernels`` residual blocks whose
+    outputs are averaged. The result goes through ``conv_post`` (one output
+    channel) and ``tanh``.
+    """
+
+    def __init__(
+        self,
+        initial_channel: int,
+        resblock: typing.Optional[str],
+        resblock_kernel_sizes: typing.Tuple[int, ...],
+        resblock_dilation_sizes: typing.Tuple[typing.Tuple[int, ...], ...],
+        upsample_rates: typing.Tuple[int, ...],
+        upsample_initial_channel: int,
+        upsample_kernel_sizes: typing.Tuple[int, ...],
+        gin_channels: int = 0,
+    ):
+        super(Generator, self).__init__()
+        self.LRELU_SLOPE = 0.1
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(
+            initial_channel, upsample_initial_channel, 7, 1, padding=3
+        )
+        # "1" selects ResBlock1; any other value (including None) selects ResBlock2.
+        resblock_module = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+        # Stage i maps upsample_initial_channel // 2**i channels to
+        # upsample_initial_channel // 2**(i + 1), with kernel k and stride u.
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+        # Residual blocks are stored flat, stage by stage: one block per
+        # (kernel size, dilation sizes) pair for every upsampling stage.
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock_module(ch, k, d))
+
+        # `ch` is the channel count of the last stage, so at least one
+        # upsampling stage is required for this line to work.
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+
+        # The conditioning projection only exists when gin_channels is non-zero.
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+    def forward(self, x, g=None):
+        """Decode ``x`` (optionally conditioned on ``g``) and return the tanh output."""
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+
+        for i, up in enumerate(self.ups):
+            x = F.leaky_relu(x, self.LRELU_SLOPE)
+            x = up(x)
+            # Placeholder; replaced by the first residual block of this stage.
+            xs = torch.zeros(1)
+            # Walk the flat resblock list and pick the num_kernels blocks that
+            # belong to stage i (index 0 .. num_kernels - 1 relative to it).
+            # NOTE(review): the flat loop with an index test, instead of
+            # slicing the list, is presumably kept for export/scripting
+            # friendliness - confirm before restructuring.
+            for j, resblock in enumerate(self.resblocks):
+                index = j - (i * self.num_kernels)
+                if index == 0:
+                    xs = resblock(x)
+                elif (index > 0) and (index < self.num_kernels):
+                    xs += resblock(x)
+            # Average the residual block outputs of this stage.
+            x = xs / self.num_kernels
+        # This call uses F.leaky_relu's default slope, not self.LRELU_SLOPE.
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
+
+    def remove_weight_norm(self):
+        """Strip weight normalization from the upsampling layers and residual blocks."""
+        print("Removing weight norm...")
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+
+
+class DiscriminatorP(torch.nn.Module):
+    """Period discriminator.
+
+    The 1-D input is folded into a 2-D map whose last axis has length
+    ``period`` and is scored by a stack of ``(kernel_size, 1)`` convolutions.
+    """
+
+    def __init__(
+        self,
+        period: int,
+        kernel_size: int = 5,
+        stride: int = 3,
+        use_spectral_norm: bool = False,
+    ):
+        super(DiscriminatorP, self).__init__()
+        self.LRELU_SLOPE = 0.1
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        norm_f = spectral_norm if use_spectral_norm else weight_norm
+
+        def normed_conv(c_in, c_out, conv_stride):
+            # (kernel_size, 1) convolution wrapped in the chosen normalization.
+            return norm_f(
+                Conv2d(
+                    c_in,
+                    c_out,
+                    (kernel_size, 1),
+                    conv_stride,
+                    padding=(get_padding(kernel_size, 1), 0),
+                )
+            )
+
+        # Four strided layers widen the channels; the last layer uses stride 1.
+        self.convs = nn.ModuleList(
+            [
+                normed_conv(1, 32, (stride, 1)),
+                normed_conv(32, 128, (stride, 1)),
+                normed_conv(128, 512, (stride, 1)),
+                normed_conv(512, 1024, (stride, 1)),
+                normed_conv(1024, 1024, 1),
+            ]
+        )
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        """Return ``(flattened_score, feature_maps)`` for input ``x``.
+
+        ``feature_maps`` holds the activation after every layer in
+        ``self.convs`` plus the raw ``conv_post`` output.
+        """
+        fmap = []
+
+        # 1d to 2d
+        b, c, t = x.shape
+        remainder = t % self.period
+        if remainder != 0:  # pad first
+            n_pad = self.period - remainder
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), self.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+
+class DiscriminatorS(torch.nn.Module):
+    """Discriminator made of a stack of normalized 1-D convolutions."""
+
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        self.LRELU_SLOPE = 0.1
+        norm_f = spectral_norm if use_spectral_norm else weight_norm
+
+        # (in_channels, out_channels, kernel, stride, groups, padding)
+        layer_specs = [
+            (1, 16, 15, 1, 1, 7),
+            (16, 64, 41, 4, 4, 20),
+            (64, 256, 41, 4, 16, 20),
+            (256, 1024, 41, 4, 64, 20),
+            (1024, 1024, 41, 4, 256, 20),
+            (1024, 1024, 5, 1, 1, 2),
+        ]
+        self.convs = nn.ModuleList(
+            [
+                norm_f(Conv1d(c_in, c_out, kernel, step, groups=grp, padding=pad))
+                for c_in, c_out, kernel, step, grp, pad in layer_specs
+            ]
+        )
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        """Return ``(flattened_score, feature_maps)`` for input ``x``.
+
+        ``feature_maps`` holds the activation after every layer in
+        ``self.convs`` plus the raw ``conv_post`` output.
+        """
+        fmap = []
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), self.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    """One ``DiscriminatorS`` followed by ``DiscriminatorP`` for periods 2, 3, 5, 7, 11."""
+
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminator, self).__init__()
+        periods = [2, 3, 5, 7, 11]
+
+        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        for period in periods:
+            members.append(
+                DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
+            )
+        self.discriminators = nn.ModuleList(members)
+
+    def forward(self, y, y_hat):
+        """Run every discriminator on ``y`` and ``y_hat``.
+
+        Returns four lists with one entry per discriminator: scores for ``y``,
+        scores for ``y_hat``, feature maps for ``y`` and feature maps for
+        ``y_hat``.
+        """
+        y_d_rs, y_d_gs = [], []
+        fmap_rs, fmap_gs = [], []
+        for disc in self.discriminators:
+            score_r, features_r = disc(y)
+            score_g, features_g = disc(y_hat)
+            y_d_rs.append(score_r)
+            y_d_gs.append(score_g)
+            fmap_rs.append(features_r)
+            fmap_gs.append(features_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class SynthesizerTrn(nn.Module):
+    """
+    Synthesizer for Training
+
+    Bundles the text encoder (``enc_p``), decoder (``dec``), posterior encoder
+    (``enc_q``), flow (``flow``), duration predictor (``dp``) and, when
+    ``n_speakers > 1``, a speaker embedding table (``emb_g``).
+    """
+
+    def __init__(
+        self,
+        n_vocab: int,
+        spec_channels: int,
+        segment_size: int,
+        inter_channels: int,
+        hidden_channels: int,
+        filter_channels: int,
+        n_heads: int,
+        n_layers: int,
+        kernel_size: int,
+        p_dropout: float,
+        resblock: str,
+        resblock_kernel_sizes: typing.Tuple[int, ...],
+        resblock_dilation_sizes: typing.Tuple[typing.Tuple[int, ...], ...],
+        upsample_rates: typing.Tuple[int, ...],
+        upsample_initial_channel: int,
+        upsample_kernel_sizes: typing.Tuple[int, ...],
+        n_speakers: int = 1,
+        gin_channels: int = 0,
+        use_sdp: bool = True,
+    ):
+        """Store the hyper-parameters and build all sub-modules.
+
+        ``use_sdp`` selects ``StochasticDurationPredictor`` (True) or
+        ``DurationPredictor`` (False). The speaker embedding is only created
+        when ``n_speakers > 1``.
+        """
+
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.n_speakers = n_speakers
+        self.gin_channels = gin_channels
+
+        self.use_sdp = use_sdp
+
+        # Prior (text) encoder.
+        self.enc_p = TextEncoder(
+            n_vocab,
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        # Decoder from latents to the output signal.
+        self.dec = Generator(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+        )
+        # Posterior encoder: kernel size 5, dilation rate 1, 16 layers.
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        # Flow between posterior and prior: kernel size 5, dilation rate 1, 4 layers.
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
+        )
+
+        # Duration predictor; both variants use kernel size 3 and dropout 0.5.
+        if use_sdp:
+            self.dp = StochasticDurationPredictor(
+                hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
+            )
+        else:
+            self.dp = DurationPredictor(
+                hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
+            )
+
+        # Speaker embedding only exists for multi-speaker models.
+        if n_speakers > 1:
+            self.emb_g = nn.Embedding(n_speakers, gin_channels)
+
+    def forward(self, x, x_lengths, y, y_lengths, sid=None):
+        """Training pass.
+
+        Returns ``(o, l_length, attn, ids_slice, x_mask, y_mask,
+        (z, z_p, m_p, logs_p, m_q, logs_q))`` where ``o`` is the decoder output
+        for a random slice of ``z`` and ``l_length`` is the duration loss term.
+        When ``n_speakers > 1`` the speaker ids ``sid`` are looked up in
+        ``emb_g``; unlike ``infer`` this method does not assert ``sid`` is given.
+        """
+
+        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
+        if self.n_speakers > 1:
+            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+        else:
+            g = None
+
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+
+        # The alignment is computed without tracking gradients.
+        with torch.no_grad():
+            # negative cross-entropy
+            s_p_sq_r = torch.exp(-2 * logs_p)  # [b, d, t]
+            neg_cent1 = torch.sum(
+                -0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True
+            )  # [b, 1, t_s]
+            neg_cent2 = torch.matmul(
+                -0.5 * (z_p**2).transpose(1, 2), s_p_sq_r
+            )  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
+            neg_cent3 = torch.matmul(
+                z_p.transpose(1, 2), (m_p * s_p_sq_r)
+            )  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
+            neg_cent4 = torch.sum(
+                -0.5 * (m_p**2) * s_p_sq_r, [1], keepdim=True
+            )  # [b, 1, t_s]
+            neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
+
+            # Combine both masks and let maximum_path choose the alignment.
+            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+            attn = (
+                monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1))
+                .unsqueeze(1)
+                .detach()
+            )
+
+        # Durations: the alignment summed over dim 2.
+        w = attn.sum(2)
+        if self.use_sdp:
+            l_length = self.dp(x, x_mask, w, g=g)
+            l_length = l_length / torch.sum(x_mask)
+        else:
+            # Deterministic predictor: squared error in the log domain
+            # (1e-6 keeps the log finite).
+            logw_ = torch.log(w + 1e-6) * x_mask
+            logw = self.dp(x, x_mask, g=g)
+            l_length = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum(
+                x_mask
+            )  # for averaging
+
+        # expand prior
+        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
+        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
+
+        # Only a random segment of z (segment_size long) is decoded.
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        o = self.dec(z_slice, g=g)
+        return (
+            o,
+            l_length,
+            attn,
+            ids_slice,
+            x_mask,
+            y_mask,
+            (z, z_p, m_p, logs_p, m_q, logs_q),
+        )
+
+    def infer(
+        self,
+        x,
+        x_lengths,
+        sid=None,
+        noise_scale=0.667,
+        length_scale=1,
+        noise_scale_w=0.8,
+        max_len=None,
+    ):
+        """Synthesize from input ids.
+
+        ``noise_scale`` scales the prior sampling noise, ``noise_scale_w`` is
+        passed to the stochastic duration predictor, ``length_scale``
+        multiplies the predicted durations and ``max_len`` truncates the latent
+        sequence before decoding. Returns
+        ``(o, attn, y_mask, (z, z_p, m_p, logs_p))``.
+        """
+        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
+        if self.n_speakers > 1:
+            assert sid is not None, "Missing speaker id"
+            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+        else:
+            g = None
+
+        if self.use_sdp:
+            logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
+        else:
+            logw = self.dp(x, x_mask, g=g)
+        # Durations are rounded up; every output has a length of at least 1.
+        w = torch.exp(logw) * x_mask * length_scale
+        w_ceil = torch.ceil(w)
+        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
+        y_mask = torch.unsqueeze(
+            commons.sequence_mask(y_lengths, y_lengths.max()), 1
+        ).type_as(x_mask)
+        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+        attn = commons.generate_path(w_ceil, attn_mask)
+
+        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
+            1, 2
+        )  # [b, t', t], [b, t, d] -> [b, d, t']
+        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
+            1, 2
+        )  # [b, t', t], [b, t, d] -> [b, d, t']
+
+        # Sample from the expanded prior, invert the flow and decode.
+        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+        z = self.flow(z_p, y_mask, g=g, reverse=True)
+        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
+
+        return o, attn, y_mask, (z, z_p, m_p, logs_p)
+
+    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
+        """Re-decode ``y`` from speaker ``sid_src`` as speaker ``sid_tgt``.
+
+        Encodes ``y`` with the source embedding, runs the flow forward with the
+        source embedding and backward with the target embedding, then decodes
+        with the target embedding. Returns ``(o_hat, y_mask, (z, z_p, z_hat))``.
+        """
+        assert self.n_speakers > 1, "n_speakers have to be larger than 1."
+        g_src = self.emb_g(sid_src).unsqueeze(-1)
+        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
+        z_p = self.flow(z, y_mask, g=g_src)
+        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
+        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
+        return o_hat, y_mask, (z, z_p, z_hat)
--- a/mlu_370-piper/piper/src/python/piper_train/vits/modules.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/modules.py
@@ -0,0 +1,527 @@
+import math
+import typing
+
+import torch
+from torch import nn
+from torch.nn import Conv1d
+from torch.nn import functional as F
+from torch.nn.utils import remove_weight_norm, weight_norm
+
+from .commons import fused_add_tanh_sigmoid_multiply, get_padding, init_weights
+from .transforms import piecewise_rational_quadratic_transform
+
+
+class LayerNorm(nn.Module):
+    """Layer normalization over dimension 1 of the input.
+
+    ``F.layer_norm`` normalizes the last dimension, so the input is transposed
+    (dim 1 <-> last dim) before the call and transposed back afterwards.
+    """
+
+    def __init__(self, channels: int, eps: float = 1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        # Learnable scale (initialized to ones) and shift (initialized to zeros).
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        """Return ``x`` normalized across dimension 1, same layout as the input."""
+        channels_last = x.transpose(1, -1)
+        normalized = F.layer_norm(
+            channels_last, (self.channels,), self.gamma, self.beta, self.eps
+        )
+        return normalized.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+    """Residual stack of ``n_layers`` (conv -> LayerNorm -> ReLU -> dropout) layers.
+
+    The stack output is projected to ``out_channels`` by a 1x1 convolution
+    whose weight and bias start at zero, and added to the original input, so
+    the module initially returns its (masked) input unchanged. The residual
+    addition requires the input and ``out_channels`` to be compatible.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        hidden_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        n_layers: int,
+        p_dropout: float,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        # The message now matches the condition actually enforced (> 1).
+        assert n_layers > 1, "Number of layers should be larger than 1."
+
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        # First layer maps in_channels -> hidden_channels.
+        self.conv_layers.append(
+            nn.Conv1d(
+                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+            )
+        )
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
+        # Remaining layers keep hidden_channels.
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(
+                nn.Conv1d(
+                    hidden_channels,
+                    hidden_channels,
+                    kernel_size,
+                    padding=kernel_size // 2,
+                )
+            )
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        # Zero-initialized projection: the residual branch starts as a no-op.
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        """Return ``(x + proj(stack(x))) * x_mask``."""
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
+
+
+class DDSConv(nn.Module):
+    """
+    Dilated and Depth-Separable Convolution
+
+    Each of the ``n_layers`` layers applies a depth-wise convolution with
+    dilation ``kernel_size ** layer_index``, a 1x1 convolution, and a LayerNorm
+    plus GELU after each of the two; the result is added back to the input.
+    """
+
+    def __init__(
+        self, channels: int, kernel_size: int, n_layers: int, p_dropout: float = 0.0
+    ):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for layer_idx in range(n_layers):
+            dilation = kernel_size**layer_idx
+            # Padding chosen from the kernel size and dilation of this layer.
+            padding = dilation * (kernel_size - 1) // 2
+            depthwise = nn.Conv1d(
+                channels,
+                channels,
+                kernel_size,
+                groups=channels,
+                dilation=dilation,
+                padding=padding,
+            )
+            self.convs_sep.append(depthwise)
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+
+    def forward(self, x, x_mask, g=None):
+        """Return the masked output; ``g`` (if given) is added to ``x`` first."""
+        if g is not None:
+            x = x + g
+        for i in range(self.n_layers):
+            branch = self.convs_sep[i](x * x_mask)
+            branch = F.gelu(self.norms_1[i](branch))
+            branch = self.convs_1x1[i](branch)
+            branch = F.gelu(self.norms_2[i](branch))
+            # Residual connection around the dropped-out branch.
+            x = x + self.drop(branch)
+        return x * x_mask
+
+
+class WN(torch.nn.Module):
+    """Stack of weight-normalized dilated convolutions with gated activations.
+
+    Layer ``i`` uses dilation ``dilation_rate ** i``. Every layer except the
+    last produces ``2 * hidden_channels`` outputs that are split into a
+    residual half (added to the running input) and a skip half (added to the
+    output); the last layer contributes only to the output. An optional
+    conditioning input ``g`` is projected once and sliced per layer.
+    """
+
+    def __init__(
+        self,
+        hidden_channels: int,
+        kernel_size: int,
+        dilation_rate: int,
+        n_layers: int,
+        gin_channels: int = 0,
+        p_dropout: float = 0,
+    ):
+        super().__init__()
+        assert kernel_size % 2 == 1
+        self.hidden_channels = hidden_channels
+        # Stored as a plain int; a stray trailing comma used to turn this
+        # attribute into a one-element tuple.
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+
+        # One projection yields the conditioning for all layers at once
+        # (2 * hidden_channels per layer); it only exists if gin_channels != 0.
+        if gin_channels != 0:
+            cond_layer = torch.nn.Conv1d(
+                gin_channels, 2 * hidden_channels * n_layers, 1
+            )
+            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+
+        for i in range(n_layers):
+            dilation = dilation_rate**i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(
+                hidden_channels,
+                2 * hidden_channels,
+                kernel_size,
+                dilation=dilation,
+                padding=padding,
+            )
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask, g=None, **kwargs):
+        """Return the masked sum of all skip outputs.
+
+        Extra keyword arguments are accepted and ignored.
+        """
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        if g is not None:
+            g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                # Slice out this layer's 2 * hidden_channels of conditioning.
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+
+            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                # First half feeds the residual path, second half the skip path.
+                res_acts = res_skip_acts[:, : self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels :, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+
+    def remove_weight_norm(self):
+        """Strip weight normalization from every convolution in the stack."""
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        kernel_size: int = 3,
+        dilation: typing.Tuple[int] = (1, 3, 5),
+    ):
+        super(ResBlock1, self).__init__()
+        self.LRELU_SLOPE = 0.1
+        self.convs1 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[2],
+                        padding=get_padding(kernel_size, dilation[2]),
+                    )
+                ),
+            ]
+        )
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+            ]
+        )
+        self.convs2.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, self.LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, self.LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c2(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+    def __init__(
+        self, channels: int, kernel_size: int = 3, dilation: typing.Tuple[int] = (1, 3)
+    ):
+        super(ResBlock2, self).__init__()
+        self.LRELU_SLOPE = 0.1
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+            ]
+        )
+        self.convs.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c in self.convs:
+            xt = F.leaky_relu(x, self.LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+class Log(nn.Module):
+    def forward(
+        self, x: torch.Tensor, x_mask: torch.Tensor, reverse: bool = False, **kwargs
+    ):
+        if not reverse:
+            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+            logdet = torch.sum(-y, [1, 2])
+            return y, logdet
+        else:
+            x = torch.exp(x) * x_mask
+            return x
+
+
+class Flip(nn.Module):
+    def forward(self, x: torch.Tensor, *args, reverse: bool = False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).type_as(x)
+            return x, logdet
+        else:
+            return x
+
+
+class ElementwiseAffine(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.channels = channels
+        self.m = nn.Parameter(torch.zeros(channels, 1))
+        self.logs = nn.Parameter(torch.zeros(channels, 1))
+
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = self.m + torch.exp(self.logs) * x
+            y = y * x_mask
+            logdet = torch.sum(self.logs * x_mask, [1, 2])
+            return y, logdet
+        else:
+            x = (x - self.m) * torch.exp(-self.logs) * x_mask
+            return x
+
+
+class ResidualCouplingLayer(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        hidden_channels: int,
+        kernel_size: int,
+        dilation_rate: int,
+        n_layers: int,
+        p_dropout: float = 0,
+        gin_channels: int = 0,
+        mean_only: bool = False,
+    ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            p_dropout=p_dropout,
+            gin_channels=gin_channels,
+        )
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            x1 = m + x1 * torch.exp(logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
+
+
+class ConvFlow(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        filter_channels: int,
+        kernel_size: int,
+        n_layers: int,
+        num_bins: int = 10,
+        tail_bound: float = 5.0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.num_bins = num_bins
+        self.tail_bound = tail_bound
+        self.half_channels = in_channels // 2
+
+        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
+        self.proj = nn.Conv1d(
+            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
+        )
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0)
+        h = self.convs(h, x_mask, g=g)
+        h = self.proj(h) * x_mask
+
+        b, c, t = x0.shape
+        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]
+
+        unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
+        unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
+            self.filter_channels
+        )
+        unnormalized_derivatives = h[..., 2 * self.num_bins :]
+
+        x1, logabsdet = piecewise_rational_quadratic_transform(
+            x1,
+            unnormalized_widths,
+            unnormalized_heights,
+            unnormalized_derivatives,
+            inverse=reverse,
+            tails="linear",
+            tail_bound=self.tail_bound,
+        )
+
+        x = torch.cat([x0, x1], 1) * x_mask
+
+        logdet = torch.sum(logabsdet * x_mask, [1, 2])
+        if not reverse:
+            return x, logdet
+        else:
+            return x
--- a/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/Makefile
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/Makefile
@@ -0,0 +1,2 @@
+# Default target: run setup.py to build the extension module(s) in place.
+all:
+	python3 setup.py build_ext --inplace
--- a/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/init.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/init.py
@@ -0,0 +1,20 @@
+import numpy as np
+import torch
+
+from .monotonic_align.core import maximum_path_c
+
+
+def maximum_path(neg_cent, mask):
+    """Cython optimized version.
+    neg_cent: [b, t_t, t_s]
+    mask: [b, t_t, t_s]
+    """
+    device = neg_cent.device
+    dtype = neg_cent.dtype
+    neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
+    path = np.zeros(neg_cent.shape, dtype=np.int32)
+
+    t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
+    t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
+    maximum_path_c(path, neg_cent, t_t_max, t_s_max)
+    return torch.from_numpy(path).to(device=device, dtype=dtype)
--- a/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/core.c
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/core.c
--- a/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/core.pyx
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/core.pyx
@@ -0,0 +1,42 @@
+cimport cython
+from cython.parallel import prange
+
+
+# Compute one monotonic alignment path for a single (path, value) pair.
+#
+# Both arguments are modified in place: the forward pass overwrites ``value``
+# with cumulative best scores, and the backtracking pass writes a 1 into
+# ``path`` in exactly one column per row.  Only rows ``0 .. t_y-1`` and
+# columns ``0 .. t_x-1`` are visited.  ``max_neg_val`` stands in for a
+# predecessor that is not allowed.
+#
+# NOTE(review): bounds checking and negative-index wraparound are disabled,
+# and the backtracking loop reads ``value[y-1, ...]`` at ``y == 0`` whenever
+# ``index`` is still non-zero there.  That cannot happen while
+# ``1 <= t_x <= t_y``; presumably callers guarantee this -- confirm.
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
+  cdef int x
+  cdef int y
+  cdef float v_prev
+  cdef float v_cur
+  cdef float tmp
+  # Backtracking starts in the last used column.
+  cdef int index = t_x - 1
+
+  # Forward pass: add to each cell the better of its two predecessors in the
+  # previous row -- same column (v_cur) or one column to the left (v_prev).
+  for y in range(t_y):
+    # Column window: x <= y, and x >= t_x - (t_y - y), i.e. only cells from
+    # which the last column is still reachable advancing one column per row.
+    for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
+      if x == y:
+        # On the diagonal the same-column predecessor is disallowed.
+        v_cur = max_neg_val
+      else:
+        v_cur = value[y-1, x]
+      if x == 0:
+        if y == 0:
+          # Start cell: nothing accumulated yet.
+          v_prev = 0.
+        else:
+          v_prev = max_neg_val
+      else:
+        v_prev = value[y-1, x-1]
+      value[y, x] += max(v_prev, v_cur)
+
+  # Backtracking: walk from the last row up, marking the current column and
+  # stepping one column left when forced (index == y) or when the left
+  # predecessor has the higher cumulative score.
+  for y in range(t_y - 1, -1, -1):
+    path[y, index] = 1
+    if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
+      index = index - 1
+
+
+# Batch entry point: run maximum_path_each on every item of the leading
+# dimension, using t_ys[i] / t_xs[i] as that item's row / column counts.
+# ``paths`` and ``values`` are modified in place; nothing is returned.
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
+  cdef int b = paths.shape[0]
+  cdef int i
+  # prange lets iterations run in parallel -- presumably only when the
+  # extension is compiled with OpenMP enabled; otherwise it is a plain loop.
+  for i in prange(b, nogil=True):
+    maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
--- a/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/setup.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/monotonic_align/setup.py
@@ -0,0 +1,13 @@
+from distutils.core import setup
+from pathlib import Path
+
+import numpy
+from Cython.Build import cythonize
+
+_DIR = Path(__file__).parent
+
+setup(
+    name="monotonic_align",
+    ext_modules=cythonize(str(_DIR / "core.pyx")),
+    include_dirs=[numpy.get_include()],
+)
--- a/mlu_370-piper/piper/src/python/piper_train/vits/transforms.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/transforms.py
@@ -0,0 +1,212 @@
+import numpy as np
+import torch
+from torch.nn import functional as F
+
+DEFAULT_MIN_BIN_WIDTH = 1e-3
+DEFAULT_MIN_BIN_HEIGHT = 1e-3
+DEFAULT_MIN_DERIVATIVE = 1e-3
+
+
+def piecewise_rational_quadratic_transform(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    tails=None,
+    tail_bound=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+
+    if tails is None:
+        spline_fn = rational_quadratic_spline
+        spline_kwargs = {}
+    else:
+        spline_fn = unconstrained_rational_quadratic_spline
+        spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
+
+    outputs, logabsdet = spline_fn(
+        inputs=inputs,
+        unnormalized_widths=unnormalized_widths,
+        unnormalized_heights=unnormalized_heights,
+        unnormalized_derivatives=unnormalized_derivatives,
+        inverse=inverse,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative,
+        **spline_kwargs
+    )
+    return outputs, logabsdet
+
+
+def searchsorted(bin_locations, inputs, eps=1e-6):
+    # bin_locations[..., -1] += eps
+    bin_locations[..., bin_locations.size(-1) - 1] += eps
+    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
+
+
+def unconstrained_rational_quadratic_spline(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    tails="linear",
+    tail_bound=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+    outside_interval_mask = ~inside_interval_mask
+
+    outputs = torch.zeros_like(inputs)
+    logabsdet = torch.zeros_like(inputs)
+
+    if tails == "linear":
+        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
+        constant = np.log(np.exp(1 - min_derivative) - 1)
+        unnormalized_derivatives[..., 0] = constant
+        # unnormalized_derivatives[..., -1] = constant
+        unnormalized_derivatives[..., unnormalized_derivatives.size(-1) - 1] = constant
+
+        outputs[outside_interval_mask] = inputs[outside_interval_mask]
+        logabsdet[outside_interval_mask] = 0
+    else:
+        raise RuntimeError("{} tails are not implemented.".format(tails))
+
+    (
+        outputs[inside_interval_mask],
+        logabsdet[inside_interval_mask],
+    ) = rational_quadratic_spline(
+        inputs=inputs[inside_interval_mask],
+        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+        inverse=inverse,
+        left=-tail_bound,
+        right=tail_bound,
+        bottom=-tail_bound,
+        top=tail_bound,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative,
+    )
+
+    return outputs, logabsdet
+
+
+def rational_quadratic_spline(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    left=0.0,
+    right=1.0,
+    bottom=0.0,
+    top=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+    # if torch.min(inputs) < left or torch.max(inputs) > right:
+    #     raise ValueError("Input to a transform is not within its domain")
+
+    num_bins = unnormalized_widths.shape[-1]
+
+    # if min_bin_width * num_bins > 1.0:
+    #     raise ValueError("Minimal bin width too large for the number of bins")
+    # if min_bin_height * num_bins > 1.0:
+    #     raise ValueError("Minimal bin height too large for the number of bins")
+
+    widths = F.softmax(unnormalized_widths, dim=-1)
+    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+    cumwidths = torch.cumsum(widths, dim=-1)
+    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
+    cumwidths = (right - left) * cumwidths + left
+    cumwidths[..., 0] = left
+    # cumwidths[..., -1] = right
+    cumwidths[..., cumwidths.size(-1) - 1] = right
+    widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+
+    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+
+    heights = F.softmax(unnormalized_heights, dim=-1)
+    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+    cumheights = torch.cumsum(heights, dim=-1)
+    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
+    cumheights = (top - bottom) * cumheights + bottom
+    cumheights[..., 0] = bottom
+    # cumheights[..., -1] = top
+    cumheights[..., cumheights.size(-1) - 1] = top
+    heights = cumheights[..., 1:] - cumheights[..., :-1]
+
+    if inverse:
+        bin_idx = searchsorted(cumheights, inputs)[..., None]
+    else:
+        bin_idx = searchsorted(cumwidths, inputs)[..., None]
+
+    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+
+    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+    delta = heights / widths
+    input_delta = delta.gather(-1, bin_idx)[..., 0]
+
+    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+
+    input_heights = heights.gather(-1, bin_idx)[..., 0]
+
+    if inverse:
+        a = (inputs - input_cumheights) * (
+            input_derivatives + input_derivatives_plus_one - 2 * input_delta
+        ) + input_heights * (input_delta - input_derivatives)
+        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
+            input_derivatives + input_derivatives_plus_one - 2 * input_delta
+        )
+        c = -input_delta * (inputs - input_cumheights)
+
+        discriminant = b.pow(2) - 4 * a * c
+        assert (discriminant >= 0).all(), discriminant
+
+        root = (2 * c) / (-b - torch.sqrt(discriminant))
+        outputs = root * input_bin_widths + input_cumwidths
+
+        theta_one_minus_theta = root * (1 - root)
+        denominator = input_delta + (
+            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+            * theta_one_minus_theta
+        )
+        derivative_numerator = input_delta.pow(2) * (
+            input_derivatives_plus_one * root.pow(2)
+            + 2 * input_delta * theta_one_minus_theta
+            + input_derivatives * (1 - root).pow(2)
+        )
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+        return outputs, -logabsdet
+
+    theta = (inputs - input_cumwidths) / input_bin_widths
+    theta_one_minus_theta = theta * (1 - theta)
+
+    numerator = input_heights * (
+        input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
+    )
+    denominator = input_delta + (
+        (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+        * theta_one_minus_theta
+    )
+    outputs = input_cumheights + numerator / denominator
+
+    derivative_numerator = input_delta.pow(2) * (
+        input_derivatives_plus_one * theta.pow(2)
+        + 2 * input_delta * theta_one_minus_theta
+        + input_derivatives * (1 - theta).pow(2)
+    )
+    logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+    return outputs, logabsdet
--- a/mlu_370-piper/piper/src/python/piper_train/vits/utils.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/utils.py
@@ -0,0 +1,16 @@
+import numpy as np
+import torch
+
+
+def to_gpu(x: torch.Tensor) -> torch.Tensor:
+    return x.contiguous().cuda(non_blocking=True)
+
+
+def audio_float_to_int16(
+    audio: np.ndarray, max_wav_value: float = 32767.0
+) -> np.ndarray:
+    """Normalize audio and convert to int16 range"""
+    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
+    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
+    audio_norm = audio_norm.astype("int16")
+    return audio_norm
--- a/mlu_370-piper/piper/src/python/piper_train/vits/wavfile.py
+++ b/mlu_370-piper/piper/src/python/piper_train/vits/wavfile.py
@@ -0,0 +1,860 @@
+"""
+Module to read / write wav files using NumPy arrays
+
+Functions
+---------
+`read`: Return the sample rate (in samples/sec) and data from a WAV file.
+
+`write`: Write a NumPy array as a WAV file.
+
+"""
+import io
+import struct
+import sys
+import warnings
+from enum import IntEnum
+
+import numpy
+
+__all__ = ["WavFileWarning", "read", "write"]
+
+
+class WavFileWarning(UserWarning):
+    pass
+
+
+class WAVE_FORMAT(IntEnum):
+    """
+    WAVE form wFormatTag IDs
+
+    Complete list is in mmreg.h in Windows 10 SDK.  ALAC and OPUS are the
+    newest additions, in v10.0.14393 2016-07
+
+    Being an ``IntEnum``, members compare equal to the raw integer tag, so a
+    tag read from a file can be compared against a member directly.
+    """
+
+    UNKNOWN = 0x0000
+    PCM = 0x0001
+    ADPCM = 0x0002
+    IEEE_FLOAT = 0x0003
+    VSELP = 0x0004
+    IBM_CVSD = 0x0005
+    ALAW = 0x0006
+    MULAW = 0x0007
+    DTS = 0x0008
+    DRM = 0x0009
+    WMAVOICE9 = 0x000A
+    WMAVOICE10 = 0x000B
+    OKI_ADPCM = 0x0010
+    DVI_ADPCM = 0x0011
+    # Same value as DVI_ADPCM, so this is an enum alias: looking up 0x0011
+    # by value yields the DVI_ADPCM member.
+    IMA_ADPCM = 0x0011  # Duplicate
+    MEDIASPACE_ADPCM = 0x0012
+    SIERRA_ADPCM = 0x0013
+    G723_ADPCM = 0x0014
+    DIGISTD = 0x0015
+    DIGIFIX = 0x0016
+    DIALOGIC_OKI_ADPCM = 0x0017
+    MEDIAVISION_ADPCM = 0x0018
+    CU_CODEC = 0x0019
+    HP_DYN_VOICE = 0x001A
+    YAMAHA_ADPCM = 0x0020
+    SONARC = 0x0021
+    DSPGROUP_TRUESPEECH = 0x0022
+    ECHOSC1 = 0x0023
+    AUDIOFILE_AF36 = 0x0024
+    APTX = 0x0025
+    AUDIOFILE_AF10 = 0x0026
+    PROSODY_1612 = 0x0027
+    LRC = 0x0028
+    DOLBY_AC2 = 0x0030
+    GSM610 = 0x0031
+    MSNAUDIO = 0x0032
+    ANTEX_ADPCME = 0x0033
+    CONTROL_RES_VQLPC = 0x0034
+    DIGIREAL = 0x0035
+    DIGIADPCM = 0x0036
+    CONTROL_RES_CR10 = 0x0037
+    NMS_VBXADPCM = 0x0038
+    CS_IMAADPCM = 0x0039
+    ECHOSC3 = 0x003A
+    ROCKWELL_ADPCM = 0x003B
+    ROCKWELL_DIGITALK = 0x003C
+    XEBEC = 0x003D
+    G721_ADPCM = 0x0040
+    G728_CELP = 0x0041
+    MSG723 = 0x0042
+    INTEL_G723_1 = 0x0043
+    INTEL_G729 = 0x0044
+    SHARP_G726 = 0x0045
+    MPEG = 0x0050
+    RT24 = 0x0052
+    PAC = 0x0053
+    MPEGLAYER3 = 0x0055
+    LUCENT_G723 = 0x0059
+    CIRRUS = 0x0060
+    ESPCM = 0x0061
+    VOXWARE = 0x0062
+    CANOPUS_ATRAC = 0x0063
+    G726_ADPCM = 0x0064
+    G722_ADPCM = 0x0065
+    DSAT = 0x0066
+    DSAT_DISPLAY = 0x0067
+    VOXWARE_BYTE_ALIGNED = 0x0069
+    VOXWARE_AC8 = 0x0070
+    VOXWARE_AC10 = 0x0071
+    VOXWARE_AC16 = 0x0072
+    VOXWARE_AC20 = 0x0073
+    VOXWARE_RT24 = 0x0074
+    VOXWARE_RT29 = 0x0075
+    VOXWARE_RT29HW = 0x0076
+    VOXWARE_VR12 = 0x0077
+    VOXWARE_VR18 = 0x0078
+    VOXWARE_TQ40 = 0x0079
+    VOXWARE_SC3 = 0x007A
+    VOXWARE_SC3_1 = 0x007B
+    SOFTSOUND = 0x0080
+    VOXWARE_TQ60 = 0x0081
+    MSRT24 = 0x0082
+    G729A = 0x0083
+    MVI_MVI2 = 0x0084
+    DF_G726 = 0x0085
+    DF_GSM610 = 0x0086
+    ISIAUDIO = 0x0088
+    ONLIVE = 0x0089
+    MULTITUDE_FT_SX20 = 0x008A
+    INFOCOM_ITS_G721_ADPCM = 0x008B
+    CONVEDIA_G729 = 0x008C
+    CONGRUENCY = 0x008D
+    SBC24 = 0x0091
+    DOLBY_AC3_SPDIF = 0x0092
+    MEDIASONIC_G723 = 0x0093
+    PROSODY_8KBPS = 0x0094
+    ZYXEL_ADPCM = 0x0097
+    PHILIPS_LPCBB = 0x0098
+    PACKED = 0x0099
+    MALDEN_PHONYTALK = 0x00A0
+    RACAL_RECORDER_GSM = 0x00A1
+    RACAL_RECORDER_G720_A = 0x00A2
+    RACAL_RECORDER_G723_1 = 0x00A3
+    RACAL_RECORDER_TETRA_ACELP = 0x00A4
+    NEC_AAC = 0x00B0
+    RAW_AAC1 = 0x00FF
+    RHETOREX_ADPCM = 0x0100
+    IRAT = 0x0101
+    VIVO_G723 = 0x0111
+    VIVO_SIREN = 0x0112
+    PHILIPS_CELP = 0x0120
+    PHILIPS_GRUNDIG = 0x0121
+    DIGITAL_G723 = 0x0123
+    SANYO_LD_ADPCM = 0x0125
+    SIPROLAB_ACEPLNET = 0x0130
+    SIPROLAB_ACELP4800 = 0x0131
+    SIPROLAB_ACELP8V3 = 0x0132
+    SIPROLAB_G729 = 0x0133
+    SIPROLAB_G729A = 0x0134
+    SIPROLAB_KELVIN = 0x0135
+    VOICEAGE_AMR = 0x0136
+    G726ADPCM = 0x0140
+    DICTAPHONE_CELP68 = 0x0141
+    DICTAPHONE_CELP54 = 0x0142
+    QUALCOMM_PUREVOICE = 0x0150
+    QUALCOMM_HALFRATE = 0x0151
+    TUBGSM = 0x0155
+    MSAUDIO1 = 0x0160
+    WMAUDIO2 = 0x0161
+    WMAUDIO3 = 0x0162
+    WMAUDIO_LOSSLESS = 0x0163
+    WMASPDIF = 0x0164
+    UNISYS_NAP_ADPCM = 0x0170
+    UNISYS_NAP_ULAW = 0x0171
+    UNISYS_NAP_ALAW = 0x0172
+    UNISYS_NAP_16K = 0x0173
+    SYCOM_ACM_SYC008 = 0x0174
+    SYCOM_ACM_SYC701_G726L = 0x0175
+    SYCOM_ACM_SYC701_CELP54 = 0x0176
+    SYCOM_ACM_SYC701_CELP68 = 0x0177
+    KNOWLEDGE_ADVENTURE_ADPCM = 0x0178
+    FRAUNHOFER_IIS_MPEG2_AAC = 0x0180
+    DTS_DS = 0x0190
+    CREATIVE_ADPCM = 0x0200
+    CREATIVE_FASTSPEECH8 = 0x0202
+    CREATIVE_FASTSPEECH10 = 0x0203
+    UHER_ADPCM = 0x0210
+    ULEAD_DV_AUDIO = 0x0215
+    ULEAD_DV_AUDIO_1 = 0x0216
+    QUARTERDECK = 0x0220
+    ILINK_VC = 0x0230
+    RAW_SPORT = 0x0240
+    ESST_AC3 = 0x0241
+    GENERIC_PASSTHRU = 0x0249
+    IPI_HSX = 0x0250
+    IPI_RPELP = 0x0251
+    CS2 = 0x0260
+    SONY_SCX = 0x0270
+    SONY_SCY = 0x0271
+    SONY_ATRAC3 = 0x0272
+    SONY_SPC = 0x0273
+    TELUM_AUDIO = 0x0280
+    TELUM_IA_AUDIO = 0x0281
+    NORCOM_VOICE_SYSTEMS_ADPCM = 0x0285
+    FM_TOWNS_SND = 0x0300
+    MICRONAS = 0x0350
+    MICRONAS_CELP833 = 0x0351
+    BTV_DIGITAL = 0x0400
+    INTEL_MUSIC_CODER = 0x0401
+    INDEO_AUDIO = 0x0402
+    QDESIGN_MUSIC = 0x0450
+    ON2_VP7_AUDIO = 0x0500
+    ON2_VP6_AUDIO = 0x0501
+    VME_VMPCM = 0x0680
+    TPC = 0x0681
+    LIGHTWAVE_LOSSLESS = 0x08AE
+    OLIGSM = 0x1000
+    OLIADPCM = 0x1001
+    OLICELP = 0x1002
+    OLISBC = 0x1003
+    OLIOPR = 0x1004
+    LH_CODEC = 0x1100
+    LH_CODEC_CELP = 0x1101
+    LH_CODEC_SBC8 = 0x1102
+    LH_CODEC_SBC12 = 0x1103
+    LH_CODEC_SBC16 = 0x1104
+    NORRIS = 0x1400
+    ISIAUDIO_2 = 0x1401
+    SOUNDSPACE_MUSICOMPRESS = 0x1500
+    MPEG_ADTS_AAC = 0x1600
+    MPEG_RAW_AAC = 0x1601
+    MPEG_LOAS = 0x1602
+    NOKIA_MPEG_ADTS_AAC = 0x1608
+    NOKIA_MPEG_RAW_AAC = 0x1609
+    VODAFONE_MPEG_ADTS_AAC = 0x160A
+    VODAFONE_MPEG_RAW_AAC = 0x160B
+    MPEG_HEAAC = 0x1610
+    VOXWARE_RT24_SPEECH = 0x181C
+    SONICFOUNDRY_LOSSLESS = 0x1971
+    INNINGS_TELECOM_ADPCM = 0x1979
+    LUCENT_SX8300P = 0x1C07
+    LUCENT_SX5363S = 0x1C0C
+    CUSEEME = 0x1F03
+    NTCSOFT_ALF2CM_ACM = 0x1FC4
+    DVM = 0x2000
+    DTS2 = 0x2001
+    MAKEAVIS = 0x3313
+    DIVIO_MPEG4_AAC = 0x4143
+    NOKIA_ADAPTIVE_MULTIRATE = 0x4201
+    DIVIO_G726 = 0x4243
+    LEAD_SPEECH = 0x434C
+    LEAD_VORBIS = 0x564C
+    WAVPACK_AUDIO = 0x5756
+    OGG_VORBIS_MODE_1 = 0x674F
+    OGG_VORBIS_MODE_2 = 0x6750
+    OGG_VORBIS_MODE_3 = 0x6751
+    OGG_VORBIS_MODE_1_PLUS = 0x676F
+    OGG_VORBIS_MODE_2_PLUS = 0x6770
+    OGG_VORBIS_MODE_3_PLUS = 0x6771
+    ALAC = 0x6C61
+    # Leading underscore only because a Python identifier cannot start with
+    # a digit.
+    _3COM_NBX = 0x7000  # Can't have leading digit
+    OPUS = 0x704F
+    FAAD_AAC = 0x706D
+    AMR_NB = 0x7361
+    AMR_WB = 0x7362
+    AMR_WP = 0x7363
+    GSM_AMR_CBR = 0x7A21
+    GSM_AMR_VBR_SID = 0x7A22
+    COMVERSE_INFOSYS_G723_1 = 0xA100
+    COMVERSE_INFOSYS_AVQSBC = 0xA101
+    COMVERSE_INFOSYS_SBC = 0xA102
+    SYMBOL_G729_A = 0xA103
+    VOICEAGE_AMR_WB = 0xA104
+    INGENIENT_G726 = 0xA105
+    MPEG4_AAC = 0xA106
+    ENCORE_G726 = 0xA107
+    ZOLL_ASAO = 0xA108
+    SPEEX_VOICE = 0xA109
+    VIANIX_MASC = 0xA10A
+    WM9_SPECTRUM_ANALYZER = 0xA10B
+    WMF_SPECTRUM_ANAYZER = 0xA10C
+    GSM_610 = 0xA10D
+    GSM_620 = 0xA10E
+    GSM_660 = 0xA10F
+    GSM_690 = 0xA110
+    GSM_ADAPTIVE_MULTIRATE_WB = 0xA111
+    POLYCOM_G722 = 0xA112
+    POLYCOM_G728 = 0xA113
+    POLYCOM_G729_A = 0xA114
+    POLYCOM_SIREN = 0xA115
+    GLOBAL_IP_ILBC = 0xA116
+    RADIOTIME_TIME_SHIFT_RADIO = 0xA117
+    NICE_ACA = 0xA118
+    NICE_ADPCM = 0xA119
+    VOCORD_G721 = 0xA11A
+    VOCORD_G726 = 0xA11B
+    VOCORD_G722_1 = 0xA11C
+    VOCORD_G728 = 0xA11D
+    VOCORD_G729 = 0xA11E
+    VOCORD_G729_A = 0xA11F
+    VOCORD_G723_1 = 0xA120
+    VOCORD_LBC = 0xA121
+    NICE_G728 = 0xA122
+    FRACE_TELECOM_G729 = 0xA123
+    CODIAN = 0xA124
+    FLAC = 0xF1AC
+    EXTENSIBLE = 0xFFFE
+    DEVELOPMENT = 0xFFFF
+
+
+KNOWN_WAVE_FORMATS = {WAVE_FORMAT.PCM, WAVE_FORMAT.IEEE_FLOAT}
+
+
+def _raise_bad_format(format_tag):
+    try:
+        format_name = WAVE_FORMAT(format_tag).name
+    except ValueError:
+        format_name = f"{format_tag:#06x}"
+    raise ValueError(
+        f"Unknown wave file format: {format_name}. Supported "
+        "formats: " + ", ".join(x.name for x in KNOWN_WAVE_FORMATS)
+    )
+
+
+def _read_fmt_chunk(fid, is_big_endian):
+    """Parse a 'fmt ' chunk; the file pointer must be just past the 'fmt ' id.
+
+    Returns
+    -------
+    size : int
+        size of format subchunk in bytes (minus 8 for "fmt " and itself)
+    format_tag : int
+        PCM or IEEE float (an EXTENSIBLE tag is replaced by its sub-format)
+    channels : int
+        number of channels
+    fs : int
+        sampling frequency in samples per second
+    bytes_per_second : int
+        overall byte rate for the file
+    block_align : int
+        bytes per sample, including all channels
+    bit_depth : int
+        bits per sample
+
+    Raises ValueError if the chunk is malformed, or if the format is neither
+    PCM nor IEEE float.
+    """
+    if is_big_endian:
+        fmt = ">"
+    else:
+        fmt = "<"
+
+    size = struct.unpack(fmt + "I", fid.read(4))[0]
+
+    if size < 16:
+        raise ValueError("Binary structure of wave file is not compliant")
+
+    res = struct.unpack(fmt + "HHIIHH", fid.read(16))
+    bytes_read = 16
+
+    format_tag, channels, fs, bytes_per_second, block_align, bit_depth = res
+
+    if format_tag == WAVE_FORMAT.EXTENSIBLE and size >= (16 + 2):
+        ext_chunk_size = struct.unpack(fmt + "H", fid.read(2))[0]
+        bytes_read += 2
+        if ext_chunk_size >= 22:
+            extensible_chunk_data = fid.read(22)
+            bytes_read += 22
+            raw_guid = extensible_chunk_data[2 + 4 : 2 + 4 + 16]
+            # GUID template {XXXXXXXX-0000-0010-8000-00AA00389B71} (RFC-2361)
+            # MS GUID byte order: first three groups are native byte order,
+            # rest is Big Endian
+            if is_big_endian:
+                tail = b"\x00\x00\x00\x10\x80\x00\x00\xAA\x00\x38\x9B\x71"
+            else:
+                tail = b"\x00\x00\x10\x00\x80\x00\x00\xAA\x00\x38\x9B\x71"
+            if raw_guid.endswith(tail):
+                format_tag = struct.unpack(fmt + "I", raw_guid[:4])[0]
+        else:
+            raise ValueError("Binary structure of wave file is not compliant")
+
+    if format_tag not in KNOWN_WAVE_FORMATS:
+        _raise_bad_format(format_tag)
+
+    # skip any remaining fmt bytes so the file pointer reaches the next chunk
+    if size > bytes_read:
+        fid.read(size - bytes_read)
+
+    # fmt size should always be 16, 18 or 40 (even), but skip a pad byte just in case
+    _handle_pad_byte(fid, size)
+
+    return (size, format_tag, channels, fs, bytes_per_second, block_align, bit_depth)
+
+
+def _read_data_chunk(
+    fid, format_tag, channels, bit_depth, is_big_endian, block_align, mmap=False
+):
+    """
+    Read the sample data of a 'data' chunk into a numpy array.
+
+    Assumes file pointer is immediately after the 'data' id.  Returns a 1-D
+    array for a single channel, else an array of shape (n_frames, channels).
+    Raises ValueError for an inconsistent header, an unsupported format or bit
+    depth, or ``mmap=True`` with a container size other than 1, 2, 4 or 8 bytes.
+
+    Notes
+    -----
+    It's possible to not use all available bits in a container, or to store
+    samples in a container bigger than necessary, so bytes_per_sample uses
+    the actual reported container size (nBlockAlign / nChannels).  Real-world
+    examples:
+
+    Adobe Audition's "24-bit packed int (type 1, 20-bit)":
+        nChannels = 2, nBlockAlign = 6, wBitsPerSample = 20
+    http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples/AFsp/M1F1-int12-AFsp.wav
+        nChannels = 2, nBlockAlign = 4, wBitsPerSample = 12
+    http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Docs/multichaudP.pdf
+        nChannels = 2, nBlockAlign = 8, wBitsPerSample = 20
+    """
+    fmt = ">" if is_big_endian else "<"
+
+    # Size of the data subchunk in bytes
+    size = struct.unpack(fmt + "I", fid.read(4))[0]
+
+    # A header with no channels, or with a frame smaller than one byte per
+    # channel, would otherwise surface as a ZeroDivisionError below.
+    if channels < 1 or block_align < channels:
+        raise ValueError("Binary structure of wave file is not compliant")
+
+    # Number of bytes per sample (sample container size)
+    bytes_per_sample = block_align // channels
+    n_samples = size // bytes_per_sample
+
+    if format_tag == WAVE_FORMAT.PCM:
+        if 1 <= bit_depth <= 8:
+            dtype = "u1"  # WAV of 8-bit integer or less are unsigned
+        elif bytes_per_sample in {3, 5, 6, 7}:
+            # No compatible dtype.  Load as raw bytes for reshaping later.
+            dtype = "V1"
+        elif bit_depth <= 64:
+            # Remaining bit depths can map directly to signed numpy dtypes
+            dtype = f"{fmt}i{bytes_per_sample}"
+        else:
+            raise ValueError(
+                "Unsupported bit depth: the WAV file "
+                f"has {bit_depth}-bit integer data."
+            )
+    elif format_tag == WAVE_FORMAT.IEEE_FLOAT:
+        if bit_depth in {32, 64}:
+            dtype = f"{fmt}f{bytes_per_sample}"
+        else:
+            raise ValueError(
+                "Unsupported bit depth: the WAV file "
+                f"has {bit_depth}-bit floating-point data."
+            )
+    else:
+        _raise_bad_format(format_tag)
+
+    start = fid.tell()
+    if not mmap:
+        try:
+            count = size if dtype == "V1" else n_samples
+            data = numpy.fromfile(fid, dtype=dtype, count=count)
+        except io.UnsupportedOperation:  # not a C-like file
+            fid.seek(start, 0)  # just in case it seeked, though it shouldn't
+            data = numpy.frombuffer(fid.read(size), dtype=dtype)
+
+        if dtype == "V1":
+            # Rearrange raw bytes into smallest compatible numpy dtype
+            dt = f"{fmt}i4" if bytes_per_sample == 3 else f"{fmt}i8"
+            a = numpy.zeros(
+                (len(data) // bytes_per_sample, numpy.dtype(dt).itemsize), dtype="V1"
+            )
+            if is_big_endian:
+                a[:, :bytes_per_sample] = data.reshape((-1, bytes_per_sample))
+            else:
+                a[:, -bytes_per_sample:] = data.reshape((-1, bytes_per_sample))
+            data = a.view(dt).reshape(a.shape[:-1])
+    else:
+        if bytes_per_sample in {1, 2, 4, 8}:
+            start = fid.tell()
+            data = numpy.memmap(
+                fid, dtype=dtype, mode="c", offset=start, shape=(n_samples,)
+            )
+            fid.seek(start + size)
+        else:
+            raise ValueError(
+                "mmap=True not compatible with "
+                f"{bytes_per_sample}-byte container size."
+            )
+
+    _handle_pad_byte(fid, size)
+
+    if channels > 1:
+        data = data.reshape(-1, channels)
+    return data
+
+
+def _skip_unknown_chunk(fid, is_big_endian):
+    """Skip one chunk body; `fid` must be positioned at the chunk's size field."""
+    raw_size = fid.read(4)
+    if not raw_size:
+        # Nothing left to read (end of file): calling unpack() on an empty
+        # buffer would only raise a needless exception, so leave quietly.
+        return
+
+    byte_order = ">" if is_big_endian else "<"
+    (chunk_len,) = struct.unpack(byte_order + "I", raw_size)
+
+    # Relative seek over the payload; a zero length simply moves nowhere
+    fid.seek(chunk_len, 1)
+    # Odd-sized chunks are followed by a pad byte that must be skipped too
+    _handle_pad_byte(fid, chunk_len)
+
+
+def _read_riff_chunk(fid):
+    """Read the 12-byte RIFF/RIFX header; return (file_size, is_big_endian)."""
+    signature = fid.read(4)
+    if signature not in (b"RIFF", b"RIFX"):
+        # There are also .wav files with "FFIR" or "XFIR" signatures?
+        raise ValueError(
+            f"File format {repr(signature)} not understood. Only "
+            "'RIFF' and 'RIFX' supported."
+        )
+
+    # "RIFX" marks a big-endian file, "RIFF" a little-endian one
+    is_big_endian = signature == b"RIFX"
+    size_format = ">I" if is_big_endian else "<I"
+
+    # The stored size excludes the 8 bytes of signature and size field
+    (riff_size,) = struct.unpack(size_format, fid.read(4))
+    file_size = riff_size + 8
+
+    form_type = fid.read(4)
+    if form_type != b"WAVE":
+        raise ValueError(f"Not a WAV file. RIFF form type is {repr(form_type)}.")
+
+    return file_size, is_big_endian
+
+
+def _handle_pad_byte(fid, size):
+    """Step over the zero pad byte that follows any odd-sized chunk."""
+    # Chunks are padded to an even length, so only odd sizes need the skip
+    if size % 2 != 0:
+        fid.seek(1, io.SEEK_CUR)
+
+
+def read(filename, mmap=False):
+    """
+    Open a WAV file.
+
+    Return the sample rate (in samples/sec) and data from an LPCM WAV file.
+
+    Parameters
+    ----------
+    filename : string or open file handle
+        Input WAV file.
+    mmap : bool, optional
+        Whether to read data as memory-mapped (default: False).  Not compatible
+        with some bit depths; see Notes.  Only to be used on real files.
+
+        .. versionadded:: 0.12.0
+
+    Returns
+    -------
+    rate : int
+        Sample rate of WAV file.
+    data : numpy array
+        Data read from WAV file. Data-type is determined from the file;
+        see Notes.  Data is 1-D for 1-channel WAV, or 2-D of shape
+        (Nsamples, Nchannels) otherwise. If a file-like input without a
+        C-like file descriptor (e.g., :class:`python:io.BytesIO`) is
+        passed, this will not be writeable.
+
+    Notes
+    -----
+    Common data types: [1]_
+
+    =====================  ===========  ===========  =============
+         WAV format            Min          Max       NumPy dtype
+    =====================  ===========  ===========  =============
+    32-bit floating-point  -1.0         +1.0         float32
+    32-bit integer PCM     -2147483648  +2147483647  int32
+    24-bit integer PCM     -2147483648  +2147483392  int32
+    16-bit integer PCM     -32768       +32767       int16
+    8-bit integer PCM      0            255          uint8
+    =====================  ===========  ===========  =============
+
+    WAV files can specify arbitrary bit depth, and this function supports
+    reading any integer PCM depth from 1 to 64 bits.  Data is returned in the
+    smallest compatible numpy int type, in left-justified format.  8-bit and
+    lower is unsigned, while 9-bit and higher is signed.
+
+    For example, 24-bit data will be stored as int32, with the MSB of the
+    24-bit data stored at the MSB of the int32, and typically the least
+    significant byte is 0x00.  (However, if a file actually contains data past
+    its specified bit depth, those bits will be read and output, too. [2]_)
+
+    This bit justification and sign matches WAV's native internal format, which
+    allows memory mapping of WAV files that use 1, 2, 4, or 8 bytes per sample
+    (so 24-bit files cannot be memory-mapped, but 32-bit can).
+
+    IEEE float PCM in 32- or 64-bit format is supported, with or without mmap.
+    Values exceeding [-1, +1] are not clipped.
+
+    Non-linear PCM (mu-law, A-law) is not supported.
+
+    References
+    ----------
+    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
+       Interface and Data Specifications 1.0", section "Data Format of the
+       Samples", August 1991
+       http://www.tactilemedia.com/info/MCI_Control_Info.html
+    .. [2] Adobe Systems Incorporated, "Adobe Audition 3 User Guide", section
+       "Audio file formats: 24-bit Packed Int (type 1, 20-bit)", 2007
+
+    Examples
+    --------
+    >>> from os.path import dirname, join as pjoin
+    >>> from scipy.io import wavfile
+    >>> import scipy.io
+
+    Get the filename for an example .wav file from the tests/data directory.
+
+    >>> data_dir = pjoin(dirname(scipy.io.__file__), 'tests', 'data')
+    >>> wav_fname = pjoin(data_dir, 'test-44100Hz-2ch-32bit-float-be.wav')
+
+    Load the .wav file contents.
+
+    >>> samplerate, data = wavfile.read(wav_fname)
+    >>> print(f"number of channels = {data.shape[1]}")
+    number of channels = 2
+    >>> length = data.shape[0] / samplerate
+    >>> print(f"length = {length}s")
+    length = 0.01s
+
+    Plot the waveform.
+
+    >>> import matplotlib.pyplot as plt
+    >>> import numpy as np
+    >>> time = np.linspace(0., length, data.shape[0])
+    >>> plt.plot(time, data[:, 0], label="Left channel")
+    >>> plt.plot(time, data[:, 1], label="Right channel")
+    >>> plt.legend()
+    >>> plt.xlabel("Time [s]")
+    >>> plt.ylabel("Amplitude")
+    >>> plt.show()
+
+    """
+    if hasattr(filename, "read"):
+        fid = filename
+        mmap = False
+    else:
+        # pylint: disable=consider-using-with
+        fid = open(filename, "rb")
+
+    try:
+        file_size, is_big_endian = _read_riff_chunk(fid)
+        fmt_chunk_received = False
+        data_chunk_received = False
+        while fid.tell() < file_size:
+            # read the next chunk
+            chunk_id = fid.read(4)
+
+            if not chunk_id:
+                if data_chunk_received:
+                    # End of file but data successfully read
+                    warnings.warn(
+                        f"Reached EOF prematurely; finished at {fid.tell()} bytes, "
+                        f"expected {file_size} bytes from header.",
+                        WavFileWarning,
+                        stacklevel=2,
+                    )
+                    break
+
+                raise ValueError("Unexpected end of file.")
+            if len(chunk_id) < 4:
+                msg = f"Incomplete chunk ID: {repr(chunk_id)}"
+                # If we have the data, ignore the broken chunk
+                if fmt_chunk_received and data_chunk_received:
+                    warnings.warn(msg + ", ignoring it.", WavFileWarning, stacklevel=2)
+                else:
+                    raise ValueError(msg)
+
+            if chunk_id == b"fmt ":
+                fmt_chunk_received = True
+                fmt_chunk = _read_fmt_chunk(fid, is_big_endian)
+                format_tag, channels, fs = fmt_chunk[1:4]
+                bit_depth = fmt_chunk[6]
+                block_align = fmt_chunk[5]
+            elif chunk_id == b"fact":
+                _skip_unknown_chunk(fid, is_big_endian)
+            elif chunk_id == b"data":
+                data_chunk_received = True
+                if not fmt_chunk_received:
+                    raise ValueError("No fmt chunk before data")
+                data = _read_data_chunk(
+                    fid,
+                    format_tag,
+                    channels,
+                    bit_depth,
+                    is_big_endian,
+                    block_align,
+                    mmap,
+                )
+            elif chunk_id == b"LIST":
+                # Someday this could be handled properly but for now skip it
+                _skip_unknown_chunk(fid, is_big_endian)
+            elif chunk_id in {b"JUNK", b"Fake"}:
+                # Skip alignment chunks without warning
+                _skip_unknown_chunk(fid, is_big_endian)
+            else:
+                warnings.warn(
+                    "Chunk (non-data) not understood, skipping it.",
+                    WavFileWarning,
+                    stacklevel=2,
+                )
+                _skip_unknown_chunk(fid, is_big_endian)
+    finally:
+        if not hasattr(filename, "read"):
+            fid.close()
+        else:
+            fid.seek(0)
+
+    return fs, data
+
+
+def write(filename, rate, data):
+    """
+    Write a NumPy array as a WAV file.
+
+    Parameters
+    ----------
+    filename : string or open file handle
+        Output wav file.
+    rate : int
+        The sample rate (in samples/sec).
+    data : ndarray
+        A 1-D or 2-D NumPy array of either integer or float data-type.
+
+    Raises
+    ------
+    ValueError
+        If `data` has an unsupported dtype (checked before the file is
+        opened), or if the file would exceed the 32-bit RIFF size field.
+
+    Notes
+    -----
+    * Writes a simple uncompressed WAV file.
+    * To write multiple-channels, use a 2-D array of shape
+      (Nsamples, Nchannels).
+    * The bits-per-sample and PCM/float will be determined by the data-type.
+
+    Common data types: [1]_
+
+    =====================  ===========  ===========  =============
+         WAV format            Min          Max       NumPy dtype
+    =====================  ===========  ===========  =============
+    32-bit floating-point  -1.0         +1.0         float32
+    32-bit PCM             -2147483648  +2147483647  int32
+    16-bit PCM             -32768       +32767       int16
+    8-bit PCM              0            255          uint8
+    =====================  ===========  ===========  =============
+
+    Note that 8-bit PCM is unsigned.
+
+    References
+    ----------
+    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
+       Interface and Data Specifications 1.0", section "Data Format of the
+       Samples", August 1991
+       http://www.tactilemedia.com/info/MCI_Control_Info.html
+
+    Examples
+    --------
+    Create a 100Hz sine wave, sampled at 44100Hz.
+    Write to 16-bit PCM, Mono.
+
+    >>> from scipy.io.wavfile import write
+    >>> samplerate = 44100; fs = 100
+    >>> t = np.linspace(0., 1., samplerate)
+    >>> amplitude = np.iinfo(np.int16).max
+    >>> data = amplitude * np.sin(2. * np.pi * fs * t)
+    >>> write("example.wav", samplerate, data.astype(np.int16))
+
+    """
+    # Validate before opening: open(..., "wb") truncates an existing file, so a
+    # rejected dtype must not be allowed to destroy the previous contents.
+    dkind = data.dtype.kind
+    if not (dkind in ("i", "f") or (dkind == "u" and data.dtype.itemsize == 1)):
+        raise ValueError(f"Unsupported data type '{data.dtype}'")
+
+    if hasattr(filename, "write"):
+        fid = filename
+    else:
+        # pylint: disable=consider-using-with
+        fid = open(filename, "wb")
+
+    fs = rate
+
+    try:
+        header_data = b""
+
+        header_data += b"RIFF"
+        header_data += b"\x00\x00\x00\x00"
+        header_data += b"WAVE"
+
+        # fmt chunk
+        header_data += b"fmt "
+        format_tag = WAVE_FORMAT.IEEE_FLOAT if dkind == "f" else WAVE_FORMAT.PCM
+        channels = 1 if data.ndim == 1 else data.shape[1]
+        bit_depth = data.dtype.itemsize * 8
+        bytes_per_second = fs * (bit_depth // 8) * channels
+        block_align = channels * (bit_depth // 8)
+
+        fmt_chunk_data = struct.pack(
+            "<HHIIHH",
+            format_tag,
+            channels,
+            fs,
+            bytes_per_second,
+            block_align,
+            bit_depth,
+        )
+        if dkind not in ("i", "u"):
+            # add cbSize field for non-PCM files
+            fmt_chunk_data += b"\x00\x00"
+
+        header_data += struct.pack("<I", len(fmt_chunk_data))
+        header_data += fmt_chunk_data
+
+        # fact chunk (non-PCM files)
+        if dkind not in ("i", "u"):
+            header_data += b"fact"
+            header_data += struct.pack("<II", 4, data.shape[0])
+
+        # check data size (needs to be immediately before the data chunk)
+        if ((len(header_data) - 4 - 4) + (4 + 4 + data.nbytes)) > 0xFFFFFFFF:
+            raise ValueError("Data exceeds wave file size limit")
+
+        fid.write(header_data)
+
+        # data chunk
+        fid.write(b"data")
+        fid.write(struct.pack("<I", data.nbytes))
+        if data.dtype.byteorder == ">" or (
+            data.dtype.byteorder == "=" and sys.byteorder == "big"
+        ):
+            data = data.byteswap()
+        _array_tofile(fid, data)
+
+        # Determine file size and place it in correct
+        #  position at start of the file.
+        size = fid.tell()
+        fid.seek(4)
+        fid.write(struct.pack("<I", size - 8))
+
+    finally:
+        if not hasattr(filename, "write"):
+            fid.close()
+        else:
+            fid.seek(0)
+
+
+def _array_tofile(fid, data):
+    raw = data.ravel().view("b")  # C-contiguous buffer viewed as int8 bytes
+    fid.write(raw.data)
--- a/mlu_370-piper/piper/src/python/piper_train/voice_conversion.py
+++ b/mlu_370-piper/piper/src/python/piper_train/voice_conversion.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import time
+from pathlib import Path
+
+import librosa
+import torch
+
+from .vits.lightning import VitsModel
+from .vits.mel_processing import spectrogram_torch
+from .vits.wavfile import write as write_wav
+
+_LOGGER = logging.getLogger("piper_train.voice_converstion")
+
+
+def main():
+    """Convert audio file(s) between two speakers of a VITS checkpoint."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("audio", nargs="+", help="Audio file(s) to convert")
+    parser.add_argument("--checkpoint", required=True, help="Path to model checkpoint")
+    parser.add_argument(
+        "--output-dir",
+        help="Directory to write WAV file(s) (default: current directory)",
+    )
+    parser.add_argument(
+        "--from-speaker", required=True, type=int, help="Speaker id number of source"
+    )
+    parser.add_argument(
+        "--to-speaker", required=True, type=int, help="Speaker id number of target"
+    )
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to the console"
+    )
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    _LOGGER.debug(args)
+
+    # -------------------------------------------------------------------------
+
+    args.checkpoint = Path(args.checkpoint)
+    args.output_dir = Path(args.output_dir) if args.output_dir else Path.cwd()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
+    model_g = model.model_g
+
+    # Inference only
+    model_g.eval()
+
+    with torch.no_grad():
+        model_g.dec.remove_weight_norm()
+
+    try:
+        for audio_path_str in args.audio:
+            audio_path = Path(audio_path_str)
+            wav_path = args.output_dir / f"{audio_path.stem}.wav"
+
+            audio, _sample_rate = librosa.load(path=audio_path_str, sr=22050)
+
+            with torch.no_grad():
+                # NOTE: audio is already in [-1, 1] coming from librosa
+                audio_norm = torch.FloatTensor(audio).unsqueeze(0)
+                spec = spectrogram_torch(
+                    y=audio_norm,
+                    n_fft=1024,
+                    sampling_rate=22050,
+                    hop_size=256,
+                    win_size=1024,
+                    center=False,
+                ).squeeze(0)
+
+                specs = spec.unsqueeze(0)
+                spec_lengths = torch.LongTensor([specs.shape[2]])
+                from_speaker = torch.LongTensor([args.from_speaker])
+                to_speaker = torch.LongTensor([args.to_speaker])
+
+                start_time = time.perf_counter()
+                audio = (
+                    model_g.voice_conversion(
+                        specs, spec_lengths, from_speaker, to_speaker
+                    )[0][0, 0]
+                    .data.cpu()
+                    .float()
+                    .numpy()
+                )
+                end_time = time.perf_counter()
+
+                _LOGGER.debug(
+                    "Converted audio in %s second(s) (%s, shape=%s)",
+                    end_time - start_time,
+                    audio_path.stem,
+                    list(audio.shape),
+                )
+
+                write_wav(str(wav_path), 22050, audio)
+
+                _LOGGER.info("Wrote WAV to %s", wav_path)
+    except KeyboardInterrupt:
+        pass
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python/requirements.txt
+++ b/mlu_370-piper/piper/src/python/requirements.txt
@@ -0,0 +1,7 @@
+cython>=0.29.0,<1
+piper-phonemize~=1.1.0
+librosa>=0.9.2,<1
+numpy>=1.19.0
+onnxruntime>=1.11.0
+pytorch-lightning
+# torch>=1.11.0,<2
--- a/mlu_370-piper/piper/src/python/requirements_dev.txt
+++ b/mlu_370-piper/piper/src/python/requirements_dev.txt
@@ -0,0 +1,7 @@
+black==22.3.0
+coverage==5.0.4
+flake8==3.7.9
+mypy==0.910
+pylint==2.10.2
+pytest==5.4.1
+pytest-cov==2.8.1
--- a/mlu_370-piper/piper/src/python/run-docker
+++ b/mlu_370-piper/piper/src/python/run-docker
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+# Follow instructions here: https://docs.docker.com/config/containers/resource_constraints/#access-an-nvidia-gpu
+docker run \
+  -it \
+  --gpus all \
+  -w "$PWD" \
+  --user "$(id -u):$(id -g)" \
+  --ipc=host \
+  -v "${HOME}:${HOME}" \
+  -v /media/cache:/media/cache:ro \
+  -v /etc/hostname:/etc/hostname:ro \
+  -v /etc/localtime:/etc/localtime:ro \
+  larynx2-train \
+  "$@"
--- a/mlu_370-piper/piper/src/python/scripts/check.sh
+++ b/mlu_370-piper/piper/src/python/scripts/check.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# Runs formatters, linters, and type checkers on Python code.
+
+set -eo pipefail
+
+# Directory of *this* script
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+base_dir="$(realpath "${this_dir}/..")"
+
+# Path to virtual environment
+: "${venv:=${base_dir}/.venv}"
+
+if [ -d "${venv}" ]; then
+    # Activate virtual environment if available
+    source "${venv}/bin/activate"
+fi
+
+python_files=("${base_dir}/piper_train")
+
+# Format code
+black "${python_files[@]}"
+isort "${python_files[@]}"
+
+# Check
+flake8 "${python_files[@]}"
+pylint "${python_files[@]}"
+mypy "${python_files[@]}"
--- a/mlu_370-piper/piper/src/python/scripts/setup.sh
+++ b/mlu_370-piper/piper/src/python/scripts/setup.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+# Directory of *this* script
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+# Base directory of repo
+base_dir="$(realpath "${this_dir}/..")"
+
+# Path to virtual environment
+: "${venv:=${base_dir}/.venv}"
+
+# Python binary to use
+: "${PYTHON=python3}"
+
+python_version="$(${PYTHON} --version)"
+
+# Create virtual environment
+echo "Creating virtual environment at ${venv} (${python_version})"
+rm -rf "${venv}"
+"${PYTHON}" -m venv "${venv}"
+source "${venv}/bin/activate"
+
+# Install Python dependencies
+echo 'Installing Python dependencies'
+pip3 install --upgrade pip
+pip3 install --upgrade wheel setuptools
+
+pip3 install -r "${base_dir}/requirements.txt"
+
+# -----------------------------------------------------------------------------
+
+echo "OK"
--- a/mlu_370-piper/piper/src/python/setup.py
+++ b/mlu_370-piper/piper/src/python/setup.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+from collections import defaultdict
+from pathlib import Path
+
+import setuptools
+from setuptools import setup
+
+this_dir = Path(__file__).parent
+module_dir = this_dir / "piper_train"
+
+# -----------------------------------------------------------------------------
+
+# Load README in as long description
+long_description: str = ""
+readme_path = this_dir / "README.md"
+if readme_path.is_file():
+    long_description = readme_path.read_text(encoding="utf-8")
+
+requirements = []
+requirements_path = this_dir / "requirements.txt"
+if requirements_path.is_file():
+    with open(requirements_path, "r", encoding="utf-8") as requirements_file:
+        requirements = requirements_file.read().splitlines()
+
+version_path = module_dir / "VERSION"
+with open(version_path, "r", encoding="utf-8") as version_file:
+    version = version_file.read().strip()
+
+# -----------------------------------------------------------------------------
+
+setup(
+    name="piper_train",
+    version=version,
+    description="A fast and local neural text to speech system",
+    long_description=long_description,
+    url="http://github.com/rhasspy/piper",
+    author="Michael Hansen",
+    author_email="mike@rhasspy.org",
+    license="MIT",
+    packages=setuptools.find_packages(),
+    package_data={
+        "piper_train": ["VERSION", "py.typed"],
+    },
+    install_requires=requirements,
+    extras_require={':python_version<"3.9"': ["importlib_resources"]},
+    entry_points={
+        "console_scripts": [
+            "piper-train = piper_train.__main__:main",
+        ]
+    },
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Topic :: Text Processing :: Linguistic",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+    ],
+    keywords="rhasspy tts speech voice",
+)
--- a/mlu_370-piper/piper/src/python_run/.gitignore
+++ b/mlu_370-piper/piper/src/python_run/.gitignore
@@ -0,0 +1,3 @@
+build/
+dist/
+*.egg-info/
--- a/mlu_370-piper/piper/src/python_run/.isort.cfg
+++ b/mlu_370-piper/piper/src/python_run/.isort.cfg
@@ -0,0 +1,6 @@
+[settings]
+multi_line_output=3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
--- a/mlu_370-piper/piper/src/python_run/MANIFEST.in
+++ b/mlu_370-piper/piper/src/python_run/MANIFEST.in
@@ -0,0 +1,2 @@
+include requirements.txt
+include piper/voices.json
--- a/mlu_370-piper/piper/src/python_run/README_http.md
+++ b/mlu_370-piper/piper/src/python_run/README_http.md
@@ -0,0 +1,27 @@
+# Piper HTTP Server
+
+Install the requirements into your virtual environment:
+
+```sh
+.venv/bin/pip3 install -r requirements_http.txt
+```
+
+Run the web server:
+
+```sh
+.venv/bin/python3 -m piper.http_server --model ...
+```
+
+See `--help` for more options.
+
+Using a `GET` request:
+
+```sh
+curl -G --data-urlencode 'text=This is a test.' -o test.wav 'localhost:5000'
+```
+
+Using a `POST` request:
+
+```sh
+curl -X POST -H 'Content-Type: text/plain' --data 'This is a test.' -o test.wav 'localhost:5000'
+```
--- a/mlu_370-piper/piper/src/python_run/mypy.ini
+++ b/mlu_370-piper/piper/src/python_run/mypy.ini
@@ -0,0 +1,7 @@
+[mypy]
+
+[mypy-onnxruntime.*]
+ignore_missing_imports = True
+
+[mypy-piper_phonemize.*]
+ignore_missing_imports = True
--- a/mlu_370-piper/piper/src/python_run/piper/init.py
+++ b/mlu_370-piper/piper/src/python_run/piper/init.py
@@ -0,0 +1,5 @@
+from .voice import PiperVoice
+
+__all__ = [
+    "PiperVoice",
+]
--- a/mlu_370-piper/piper/src/python_run/piper/main.py
+++ b/mlu_370-piper/piper/src/python_run/piper/main.py
@@ -0,0 +1,159 @@
+import argparse
+import logging
+import sys
+import time
+import wave
+from pathlib import Path
+from typing import Any, Dict
+
+from . import PiperVoice
+from .download import ensure_voice_exists, find_voice, get_voices
+
+_FILE = Path(__file__)
+_DIR = _FILE.parent
+_LOGGER = logging.getLogger(_FILE.stem)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
+    parser.add_argument("-c", "--config", help="Path to model config file")
+    parser.add_argument(
+        "-f",
+        "--output-file",
+        "--output_file",
+        help="Path to output WAV file (default: stdout)",
+    )
+    parser.add_argument(
+        "-d",
+        "--output-dir",
+        "--output_dir",
+        help="Path to output directory (default: cwd)",
+    )
+    parser.add_argument(
+        "--output-raw",
+        "--output_raw",
+        action="store_true",
+        help="Stream raw audio to stdout",
+    )
+    # Synthesis parameters (passed through to the voice unchanged)
+    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
+    parser.add_argument(
+        "--length-scale", "--length_scale", type=float, help="Phoneme length"
+    )
+    parser.add_argument(
+        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
+    )
+    parser.add_argument(
+        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
+    )
+    # Hardware selection
+    parser.add_argument("--cuda", action="store_true", help="Use GPU")
+    # Pause inserted after each sentence
+    parser.add_argument(
+        "--sentence-silence",
+        "--sentence_silence",
+        type=float,
+        default=0.0,
+        help="Seconds of silence after each sentence",
+    )
+    # Where voices are looked up and downloaded
+    parser.add_argument(
+        "--data-dir",
+        "--data_dir",
+        action="append",
+        default=[str(Path.cwd())],
+        help="Data directory to check for downloaded models (default: current directory)",
+    )
+    parser.add_argument(
+        "--download-dir",
+        "--download_dir",
+        help="Directory to download voices into (default: first data dir)",
+    )
+    # Voice list refresh
+    parser.add_argument(
+        "--update-voices",
+        action="store_true",
+        help="Download latest voices.json during startup",
+    )
+    # Diagnostics
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to console"
+    )
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+    _LOGGER.debug(args)
+
+    if not args.download_dir:
+        # Download to first data directory by default (the cwd entry comes first)
+        args.download_dir = args.data_dir[0]
+
+    # Download voice if file doesn't exist
+    model_path = Path(args.model)
+    if not model_path.exists():
+        # Load voice info
+        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)
+
+        # Resolve aliases for backwards compatibility with old voice names
+        aliases_info: Dict[str, Any] = {}
+        for voice_info in voices_info.values():
+            for voice_alias in voice_info.get("aliases", []):
+                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}
+
+        voices_info.update(aliases_info)
+        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
+        args.model, args.config = find_voice(args.model, args.data_dir)
+
+    # Load voice
+    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
+    synthesize_args = {
+        "speaker_id": args.speaker,
+        "length_scale": args.length_scale,
+        "noise_scale": args.noise_scale,
+        "noise_w": args.noise_w,
+        "sentence_silence": args.sentence_silence,
+    }
+
+    if args.output_raw:
+        # Read line-by-line
+        for line in sys.stdin:
+            line = line.strip()
+            if not line:
+                continue
+
+            # Write raw audio to stdout as it's produced
+            audio_stream = voice.synthesize_stream_raw(line, **synthesize_args)
+            for audio_bytes in audio_stream:
+                sys.stdout.buffer.write(audio_bytes)
+                sys.stdout.buffer.flush()
+    elif args.output_dir:
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Read line-by-line
+        for line in sys.stdin:
+            line = line.strip()
+            if not line:
+                continue
+
+            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
+            with wave.open(str(wav_path), "wb") as wav_file:
+                voice.synthesize(line, wav_file, **synthesize_args)
+
+            _LOGGER.info("Wrote %s", wav_path)
+    else:
+        # Read entire input
+        text = sys.stdin.read()
+
+        if (not args.output_file) or (args.output_file == "-"):
+            # Write to stdout
+            with wave.open(sys.stdout.buffer, "wb") as wav_file:
+                voice.synthesize(text, wav_file, **synthesize_args)
+        else:
+            # Write to file
+            with wave.open(args.output_file, "wb") as wav_file:
+                voice.synthesize(text, wav_file, **synthesize_args)
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python_run/piper/config.py
+++ b/mlu_370-piper/piper/src/python_run/piper/config.py
@@ -0,0 +1,53 @@
+"""Piper configuration"""
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Dict, Mapping, Sequence
+
+
+class PhonemeType(str, Enum):
+    """Phonemization modes, stored as the string values used in a voice config."""
+
+    # PiperVoice.phonemize routes this type through phonemize_espeak
+    ESPEAK = "espeak"
+    # PiperVoice.phonemize routes this type through phonemize_codepoints
+    TEXT = "text"
+
+
+@dataclass
+class PiperConfig:
+    """Piper configuration"""
+
+    num_symbols: int
+    """Number of phonemes"""
+
+    num_speakers: int
+    """Number of speakers"""
+
+    sample_rate: int
+    """Sample rate of output audio"""
+
+    espeak_voice: str
+    """Name of espeak-ng voice or alphabet"""
+
+    length_scale: float
+    noise_scale: float
+    noise_w: float
+
+    phoneme_id_map: Mapping[str, Sequence[int]]
+    """Phoneme -> [id,]"""
+
+    phoneme_type: PhonemeType
+    """espeak or text"""
+
+    @staticmethod
+    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
+        inference = config.get("inference", {})
+
+        return PiperConfig(
+            num_symbols=config["num_symbols"],
+            num_speakers=config["num_speakers"],
+            sample_rate=config["audio"]["sample_rate"],
+            noise_scale=inference.get("noise_scale", 0.667),
+            length_scale=inference.get("length_scale", 1.0),
+            noise_w=inference.get("noise_w", 0.8),
+            #
+            espeak_voice=config["espeak"]["voice"],
+            phoneme_id_map=config["phoneme_id_map"],
+            phoneme_type=PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK)),
+        )
--- a/mlu_370-piper/piper/src/python_run/piper/const.py
+++ b/mlu_370-piper/piper/src/python_run/piper/const.py
@@ -0,0 +1,5 @@
+"""Constants"""
+
+# Special symbols looked up in a voice's phoneme_id_map by PiperVoice.phonemes_to_ids
+PAD = "_"  # padding (0); its ids are appended after every phoneme
+BOS = "^"  # beginning of sentence; its ids start every id sequence
+EOS = "$"  # end of sentence; its ids end every id sequence
--- a/mlu_370-piper/piper/src/python_run/piper/download.py
+++ b/mlu_370-piper/piper/src/python_run/piper/download.py
@@ -0,0 +1,139 @@
+"""Utility for downloading Piper voices."""
+import json
+import logging
+import shutil
+from pathlib import Path
+from typing import Any, Dict, Iterable, Set, Tuple, Union
+from urllib.request import urlopen
+
+from .file_hash import get_file_hash
+
+# URL template for files in the rhasspy/piper-voices repository (v1.0.0 revision);
+# {file} is filled with "voices.json" or a voice file path.
+URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"
+
+# Directory of this module; the embedded voices.json is read from here
+_DIR = Path(__file__).parent
+_LOGGER = logging.getLogger(__name__)
+
+# File names that ensure_voice_exists neither verifies nor downloads
+_SKIP_FILES = {"MODEL_CARD"}
+
+
+class VoiceNotFoundError(Exception):
+    pass
+
+
+def get_voices(
+    download_dir: Union[str, Path], update_voices: bool = False
+) -> Dict[str, Any]:
+    """Loads available voices from downloaded or embedded JSON file."""
+    download_dir = Path(download_dir)
+    voices_download = download_dir / "voices.json"
+
+    if update_voices:
+        # Download latest voices.json
+        voices_url = URL_FORMAT.format(file="voices.json")
+        _LOGGER.debug("Downloading %s to %s", voices_url, voices_download)
+        with urlopen(voices_url) as response, open(
+            voices_download, "wb"
+        ) as download_file:
+            shutil.copyfileobj(response, download_file)
+
+    # Prefer downloaded file to embedded
+    voices_embedded = _DIR / "voices.json"
+    voices_path = voices_download if voices_download.exists() else voices_embedded
+
+    _LOGGER.debug("Loading %s", voices_path)
+    with open(voices_path, "r", encoding="utf-8") as voices_file:
+        return json.load(voices_file)
+
+
+def ensure_voice_exists(
+    name: str,
+    data_dirs: Iterable[Union[str, Path]],
+    download_dir: Union[str, Path],
+    voices_info: Dict[str, Any],
+):
+    assert data_dirs, "No data dirs"
+    if name not in voices_info:
+        raise VoiceNotFoundError(name)
+
+    voice_info = voices_info[name]
+    voice_files = voice_info["files"]
+    files_to_download: Set[str] = set()
+
+    for data_dir in data_dirs:
+        data_dir = Path(data_dir)
+
+        # Check sizes/hashes
+        for file_path, file_info in voice_files.items():
+            if file_path in files_to_download:
+                # Already planning to download
+                continue
+
+            file_name = Path(file_path).name
+            if file_name in _SKIP_FILES:
+                continue
+
+            data_file_path = data_dir / file_name
+            _LOGGER.debug("Checking %s", data_file_path)
+            if not data_file_path.exists():
+                _LOGGER.debug("Missing %s", data_file_path)
+                files_to_download.add(file_path)
+                continue
+
+            expected_size = file_info["size_bytes"]
+            actual_size = data_file_path.stat().st_size
+            if expected_size != actual_size:
+                _LOGGER.warning(
+                    "Wrong size (expected=%s, actual=%s) for %s",
+                    expected_size,
+                    actual_size,
+                    data_file_path,
+                )
+                files_to_download.add(file_path)
+                continue
+
+            expected_hash = file_info["md5_digest"]
+            actual_hash = get_file_hash(data_file_path)
+            if expected_hash != actual_hash:
+                _LOGGER.warning(
+                    "Wrong hash (expected=%s, actual=%s) for %s",
+                    expected_hash,
+                    actual_hash,
+                    data_file_path,
+                )
+                files_to_download.add(file_path)
+                continue
+
+    if (not voice_files) and (not files_to_download):
+        raise ValueError(f"Unable to find or download voice: {name}")
+
+    # Download missing files
+    download_dir = Path(download_dir)
+
+    for file_path in files_to_download:
+        file_name = Path(file_path).name
+        if file_name in _SKIP_FILES:
+            continue
+
+        file_url = URL_FORMAT.format(file=file_path)
+        download_file_path = download_dir / file_name
+        download_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+        _LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
+        with urlopen(file_url) as response, open(
+            download_file_path, "wb"
+        ) as download_file:
+            shutil.copyfileobj(response, download_file)
+
+        _LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)
+
+
+def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
+    for data_dir in data_dirs:
+        data_dir = Path(data_dir)
+        onnx_path = data_dir / f"{name}.onnx"
+        config_path = data_dir / f"{name}.onnx.json"
+
+        if onnx_path.exists() and config_path.exists():
+            return onnx_path, config_path
+
+    raise ValueError(f"Missing files for voice {name}")
--- a/mlu_370-piper/piper/src/python_run/piper/file_hash.py
+++ b/mlu_370-piper/piper/src/python_run/piper/file_hash.py
@@ -0,0 +1,46 @@
+import argparse
+import hashlib
+import json
+import sys
+from pathlib import Path
+from typing import Union
+
+
+def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
+    """Hash a file in chunks using md5."""
+    path_hash = hashlib.md5()
+    with open(path, "rb") as path_file:
+        chunk = path_file.read(bytes_per_chunk)
+        while chunk:
+            path_hash.update(chunk)
+            chunk = path_file.read(bytes_per_chunk)
+
+    return path_hash.hexdigest()
+
+
+# -----------------------------------------------------------------------------
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("file", nargs="+")
+    parser.add_argument("--dir", help="Parent directory")
+    args = parser.parse_args()
+
+    if args.dir:
+        args.dir = Path(args.dir)
+
+    hashes = {}
+    for path_str in args.file:
+        path = Path(path_str)
+        path_hash = get_file_hash(path)
+        if args.dir:
+            path = path.relative_to(args.dir)
+
+        hashes[str(path)] = path_hash
+
+    json.dump(hashes, sys.stdout)
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python_run/piper/http_server.py
+++ b/mlu_370-piper/piper/src/python_run/piper/http_server.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+import argparse
+import io
+import logging
+import wave
+from pathlib import Path
+from typing import Any, Dict
+
+from flask import Flask, request
+
+from . import PiperVoice
+from .download import ensure_voice_exists, find_voice, get_voices
+
+_LOGGER = logging.getLogger()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="0.0.0.0", help="HTTP server host")
+    parser.add_argument("--port", type=int, default=5000, help="HTTP server port")
+    #
+    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
+    parser.add_argument("-c", "--config", help="Path to model config file")
+    #
+    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
+    parser.add_argument(
+        "--length-scale", "--length_scale", type=float, help="Phoneme length"
+    )
+    parser.add_argument(
+        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
+    )
+    parser.add_argument(
+        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
+    )
+    #
+    parser.add_argument("--cuda", action="store_true", help="Use GPU")
+    #
+    parser.add_argument(
+        "--sentence-silence",
+        "--sentence_silence",
+        type=float,
+        default=0.0,
+        help="Seconds of silence after each sentence",
+    )
+    #
+    parser.add_argument(
+        "--data-dir",
+        "--data_dir",
+        action="append",
+        default=[str(Path.cwd())],
+        help="Data directory to check for downloaded models (default: current directory)",
+    )
+    parser.add_argument(
+        "--download-dir",
+        "--download_dir",
+        help="Directory to download voices into (default: first data dir)",
+    )
+    #
+    parser.add_argument(
+        "--update-voices",
+        action="store_true",
+        help="Download latest voices.json during startup",
+    )
+    #
+    parser.add_argument(
+        "--debug", action="store_true", help="Print DEBUG messages to console"
+    )
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+    _LOGGER.debug(args)
+
+    if not args.download_dir:
+        # Download to first data directory by default
+        args.download_dir = args.data_dir[0]
+
+    # Download voice if file doesn't exist
+    model_path = Path(args.model)
+    if not model_path.exists():
+        # Load voice info
+        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)
+
+        # Resolve aliases for backwards compatibility with old voice names
+        aliases_info: Dict[str, Any] = {}
+        for voice_info in voices_info.values():
+            for voice_alias in voice_info.get("aliases", []):
+                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}
+
+        voices_info.update(aliases_info)
+        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
+        args.model, args.config = find_voice(args.model, args.data_dir)
+
+    # Load voice
+    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
+    synthesize_args = {
+        "speaker_id": args.speaker,
+        "length_scale": args.length_scale,
+        "noise_scale": args.noise_scale,
+        "noise_w": args.noise_w,
+        "sentence_silence": args.sentence_silence,
+    }
+
+    # Create web server
+    app = Flask(__name__)
+
+    @app.route("/", methods=["GET", "POST"])
+    def app_synthesize() -> bytes:
+        if request.method == "POST":
+            text = request.data.decode("utf-8")
+        else:
+            text = request.args.get("text", "")
+
+        text = text.strip()
+        if not text:
+            raise ValueError("No text provided")
+
+        _LOGGER.debug("Synthesizing text: %s", text)
+        with io.BytesIO() as wav_io:
+            with wave.open(wav_io, "wb") as wav_file:
+                voice.synthesize(text, wav_file, **synthesize_args)
+
+            return wav_io.getvalue()
+
+    app.run(host=args.host, port=args.port)
+
+
+if __name__ == "__main__":
+    main()
--- a/mlu_370-piper/piper/src/python_run/piper/util.py
+++ b/mlu_370-piper/piper/src/python_run/piper/util.py
@@ -0,0 +1,12 @@
+"""Utilities"""
+import numpy as np
+
+
+def audio_float_to_int16(
+    audio: np.ndarray, max_wav_value: float = 32767.0
+) -> np.ndarray:
+    """Normalize audio and convert to int16 range"""
+    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
+    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
+    audio_norm = audio_norm.astype("int16")
+    return audio_norm
--- a/mlu_370-piper/piper/src/python_run/piper/voice.py
+++ b/mlu_370-piper/piper/src/python_run/piper/voice.py
@@ -0,0 +1,185 @@
+import json
+import logging
+import wave
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+import onnxruntime
+from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
+
+from .config import PhonemeType, PiperConfig
+from .const import BOS, EOS, PAD
+from .util import audio_float_to_int16
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@dataclass
+class PiperVoice:
+    session: onnxruntime.InferenceSession
+    config: PiperConfig
+
+    @staticmethod
+    def load(
+        model_path: Union[str, Path],
+        config_path: Optional[Union[str, Path]] = None,
+        use_cuda: bool = False,
+    ) -> "PiperVoice":
+        """Load an ONNX model and config."""
+        if config_path is None:
+            config_path = f"{model_path}.json"
+
+        with open(config_path, "r", encoding="utf-8") as config_file:
+            config_dict = json.load(config_file)
+
+        providers: List[Union[str, Tuple[str, Dict[str, Any]]]]
+        if use_cuda:
+            providers = [
+                (
+                    "CUDAExecutionProvider",
+                    {"cudnn_conv_algo_search": "HEURISTIC"},
+                )
+            ]
+        else:
+            providers = ["CPUExecutionProvider"]
+
+        return PiperVoice(
+            config=PiperConfig.from_dict(config_dict),
+            session=onnxruntime.InferenceSession(
+                str(model_path),
+                sess_options=onnxruntime.SessionOptions(),
+                providers=providers,
+            ),
+        )
+
+    def phonemize(self, text: str) -> List[List[str]]:
+        """Text to phonemes grouped by sentence."""
+        if self.config.phoneme_type == PhonemeType.ESPEAK:
+            if self.config.espeak_voice == "ar":
+                # Arabic diacritization
+                # https://github.com/mush42/libtashkeel/
+                text = tashkeel_run(text)
+
+            return phonemize_espeak(text, self.config.espeak_voice)
+
+        if self.config.phoneme_type == PhonemeType.TEXT:
+            return phonemize_codepoints(text)
+
+        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")
+
+    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
+        """Phonemes to ids."""
+        id_map = self.config.phoneme_id_map
+        ids: List[int] = list(id_map[BOS])
+
+        for phoneme in phonemes:
+            if phoneme not in id_map:
+                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
+                continue
+
+            ids.extend(id_map[phoneme])
+            ids.extend(id_map[PAD])
+
+        ids.extend(id_map[EOS])
+
+        return ids
+
+    def synthesize(
+        self,
+        text: str,
+        wav_file: wave.Wave_write,
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+        sentence_silence: float = 0.0,
+    ):
+        """Synthesize WAV audio from text."""
+        wav_file.setframerate(self.config.sample_rate)
+        wav_file.setsampwidth(2)  # 16-bit
+        wav_file.setnchannels(1)  # mono
+
+        for audio_bytes in self.synthesize_stream_raw(
+            text,
+            speaker_id=speaker_id,
+            length_scale=length_scale,
+            noise_scale=noise_scale,
+            noise_w=noise_w,
+            sentence_silence=sentence_silence,
+        ):
+            wav_file.writeframes(audio_bytes)
+
+    def synthesize_stream_raw(
+        self,
+        text: str,
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+        sentence_silence: float = 0.0,
+    ) -> Iterable[bytes]:
+        """Synthesize raw audio per sentence from text."""
+        sentence_phonemes = self.phonemize(text)
+
+        # 16-bit mono
+        num_silence_samples = int(sentence_silence * self.config.sample_rate)
+        silence_bytes = bytes(num_silence_samples * 2)
+
+        for phonemes in sentence_phonemes:
+            phoneme_ids = self.phonemes_to_ids(phonemes)
+            yield self.synthesize_ids_to_raw(
+                phoneme_ids,
+                speaker_id=speaker_id,
+                length_scale=length_scale,
+                noise_scale=noise_scale,
+                noise_w=noise_w,
+            ) + silence_bytes
+
+    def synthesize_ids_to_raw(
+        self,
+        phoneme_ids: List[int],
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+    ) -> bytes:
+        """Synthesize raw audio from phoneme ids."""
+        if length_scale is None:
+            length_scale = self.config.length_scale
+
+        if noise_scale is None:
+            noise_scale = self.config.noise_scale
+
+        if noise_w is None:
+            noise_w = self.config.noise_w
+
+        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+        scales = np.array(
+            [noise_scale, length_scale, noise_w],
+            dtype=np.float32,
+        )
+
+        args = {
+            "input": phoneme_ids_array,
+            "input_lengths": phoneme_ids_lengths,
+            "scales": scales
+        }
+
+        if self.config.num_speakers <= 1:
+            speaker_id = None
+
+        if (self.config.num_speakers > 1) and (speaker_id is None):
+            # Default speaker
+            speaker_id = 0
+
+        if speaker_id is not None:
+            sid = np.array([speaker_id], dtype=np.int64)
+            args["sid"] = sid
+
+        # Synthesize through Onnx
+        audio = self.session.run(None, args, )[0].squeeze((0, 1))
+        audio = audio_float_to_int16(audio.squeeze())
+        return audio.tobytes()
--- a/mlu_370-piper/piper/src/python_run/piper/voices.json
+++ b/mlu_370-piper/piper/src/python_run/piper/voices.json
--- a/mlu_370-piper/piper/src/python_run/py.typed
+++ b/mlu_370-piper/piper/src/python_run/py.typed
--- a/mlu_370-piper/piper/src/python_run/pylintrc
+++ b/mlu_370-piper/piper/src/python_run/pylintrc
@@ -0,0 +1,37 @@
+[MESSAGES CONTROL]
+disable=
+  format,
+  abstract-method,
+  cyclic-import,
+  duplicate-code,
+  global-statement,
+  import-outside-toplevel,
+  inconsistent-return-statements,
+  locally-disabled,
+  not-context-manager,
+  too-few-public-methods,
+  too-many-arguments,
+  too-many-branches,
+  too-many-instance-attributes,
+  too-many-lines,
+  too-many-locals,
+  too-many-public-methods,
+  too-many-return-statements,
+  too-many-statements,
+  too-many-boolean-expressions,
+  unnecessary-pass,
+  unused-argument,
+  broad-except,
+  too-many-nested-blocks,
+  invalid-name,
+  unused-import,
+  fixme,
+  useless-super-delegation,
+  missing-module-docstring,
+  missing-class-docstring,
+  missing-function-docstring,
+  import-error,
+  relative-beyond-top-level
+
+[FORMAT]
+expected-line-ending-format=LF
--- a/mlu_370-piper/piper/src/python_run/requirements.txt
+++ b/mlu_370-piper/piper/src/python_run/requirements.txt
@@ -0,0 +1,2 @@
+piper-phonemize~=1.1.0
+onnxruntime>=1.11.0,<2
--- a/mlu_370-piper/piper/src/python_run/requirements_dev.txt
+++ b/mlu_370-piper/piper/src/python_run/requirements_dev.txt
@@ -0,0 +1,5 @@
+black==22.12.0
+flake8==6.0.0
+isort==5.11.3
+mypy==0.991
+pylint==2.15.9
--- a/mlu_370-piper/piper/src/python_run/requirements_gpu.txt
+++ b/mlu_370-piper/piper/src/python_run/requirements_gpu.txt
@@ -0,0 +1 @@
+onnxruntime-gpu>=1.11.0,<2
--- a/mlu_370-piper/piper/src/python_run/requirements_http.txt
+++ b/mlu_370-piper/piper/src/python_run/requirements_http.txt
@@ -0,0 +1 @@
+flask>=3,<4
--- a/mlu_370-piper/piper/src/python_run/script/format
+++ b/mlu_370-piper/piper/src/python_run/script/format
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+import subprocess
+import venv
+from pathlib import Path
+
+_DIR = Path(__file__).parent
+_PROGRAM_DIR = _DIR.parent
+_VENV_DIR = _PROGRAM_DIR / ".venv"
+_MODULE_DIR = _PROGRAM_DIR / "piper"
+
+context = venv.EnvBuilder().ensure_directories(_VENV_DIR)
+subprocess.check_call([context.env_exe, "-m", "black", str(_MODULE_DIR)])
+subprocess.check_call([context.env_exe, "-m", "isort", str(_MODULE_DIR)])
--- a/mlu_370-piper/piper/src/python_run/script/lint
+++ b/mlu_370-piper/piper/src/python_run/script/lint
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+import subprocess
+import venv
+from pathlib import Path
+
+_DIR = Path(__file__).parent
+_PROGRAM_DIR = _DIR.parent
+_VENV_DIR = _PROGRAM_DIR / ".venv"
+_MODULE_DIR = _PROGRAM_DIR / "piper"
+
+context = venv.EnvBuilder().ensure_directories(_VENV_DIR)
+subprocess.check_call([context.env_exe, "-m", "black", str(_MODULE_DIR), "--check"])
+subprocess.check_call([context.env_exe, "-m", "isort", str(_MODULE_DIR), "--check"])
+subprocess.check_call([context.env_exe, "-m", "flake8", str(_MODULE_DIR)])
+subprocess.check_call([context.env_exe, "-m", "pylint", str(_MODULE_DIR)])
+subprocess.check_call([context.env_exe, "-m", "mypy", str(_MODULE_DIR)])
--- a/mlu_370-piper/piper/src/python_run/script/piper
+++ b/mlu_370-piper/piper/src/python_run/script/piper
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+import sys
+import subprocess
+import venv
+from pathlib import Path
+
+_DIR = Path(__file__).parent
+_PROGRAM_DIR = _DIR.parent
+_VENV_DIR = _PROGRAM_DIR / ".venv"
+
+context = venv.EnvBuilder().ensure_directories(_VENV_DIR)
+subprocess.check_call([context.env_exe, "-m", "piper"] + sys.argv[1:])
--- a/mlu_370-piper/piper/src/python_run/script/setup
+++ b/mlu_370-piper/piper/src/python_run/script/setup
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+import subprocess
+import venv
+from pathlib import Path
+
+_DIR = Path(__file__).parent
+_PROGRAM_DIR = _DIR.parent
+_VENV_DIR = _PROGRAM_DIR / ".venv"
+
+
+# Create virtual environment
+builder = venv.EnvBuilder(with_pip=True)
+context = builder.ensure_directories(_VENV_DIR)
+builder.create(_VENV_DIR)
+
+# Upgrade dependencies
+pip = [context.env_exe, "-m", "pip"]
+subprocess.check_call(pip + ["install", "--upgrade", "pip"])
+subprocess.check_call(pip + ["install", "--upgrade", "setuptools", "wheel"])
+
+# Install requirements
+subprocess.check_call(
+    pip
+    + [
+        "install",
+        "-f",
+        "https://synesthesiam.github.io/prebuilt-apps/",
+        "-r",
+        str(_PROGRAM_DIR / "requirements.txt"),
+    ]
+)
--- a/mlu_370-piper/piper/src/python_run/setup.cfg
+++ b/mlu_370-piper/piper/src/python_run/setup.cfg
@@ -0,0 +1,22 @@
+[flake8]
+# To work with Black
+max-line-length = 88
+# E501: line too long
+# W503: Line break occurred before a binary operator
+# E203: Whitespace before ':'
+# D202 No blank lines allowed after function docstring
+# W504 line break after binary operator
+ignore =
+    E501,
+    W503,
+    E203,
+    D202,
+    W504
+
+[isort]
+multi_line_output = 3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
+indent = "    "
--- a/mlu_370-piper/piper/src/python_run/setup.py
+++ b/mlu_370-piper/piper/src/python_run/setup.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+from pathlib import Path
+
+import setuptools
+from setuptools import setup
+
+this_dir = Path(__file__).parent
+module_dir = this_dir / "piper"
+
+requirements = []
+requirements_path = this_dir / "requirements.txt"
+if requirements_path.is_file():
+    with open(requirements_path, "r", encoding="utf-8") as requirements_file:
+        requirements = requirements_file.read().splitlines()
+
+data_files = [module_dir / "voices.json"]
+
+# -----------------------------------------------------------------------------
+
+setup(
+    name="piper-tts",
+    version="1.2.0",
+    description="A fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4.",
+    url="http://github.com/rhasspy/piper",
+    author="Michael Hansen",
+    author_email="mike@rhasspy.org",
+    license="MIT",
+    packages=setuptools.find_packages(),
+    package_data={"piper": [str(p.relative_to(module_dir)) for p in data_files]},
+    entry_points={
+        "console_scripts": [
+            "piper = piper.__main__:main",
+        ]
+    },
+    install_requires=requirements,
+    extras_require={"gpu": ["onnxruntime-gpu>=1.11.0,<2"], "http": ["flask>=3,<4"]},
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Topic :: Text Processing :: Linguistic",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+    ],
+    keywords="rhasspy piper tts",
+)