merge code repo for f5 and gpt and kokoro
This commit is contained in:
7
Dockerfile_f5
Normal file
7
Dockerfile_f5
Normal file
@@ -0,0 +1,7 @@
|
||||
# Base image: Iluvatar corex toolchain, 3.2.1 — matches the
# torch==2.1.0+corex.3.2.1 pin in constraints_f5.txt.
FROM corex:3.2.1

WORKDIR /workspace

COPY requirements_f5.txt constraints_f5.txt f5_server.py launch_f5.sh /workspace/
# The constraints file pins torch to the corex build so pip does not
# replace it with the upstream CUDA wheel.
RUN pip install -r requirements_f5.txt -c constraints_f5.txt

ENTRYPOINT ["/bin/bash", "launch_f5.sh"]
|
||||
10
Dockerfile_gsv
Normal file
10
Dockerfile_gsv
Normal file
@@ -0,0 +1,10 @@
|
||||
# Base image: Iluvatar corex toolchain, 3.2.1 — matches the
# torch==2.1.0+corex.3.2.1 pin in constraints_gsv.txt.
FROM corex:3.2.1

WORKDIR /workspace

# Fix: `COPY <dir> <dest>/` copies the directory's *contents*, not the
# directory itself, so the original single COPY left no /workspace/GPT-SoVITS
# directory and the GPT-SoVITS/... paths below (and in gsv_server.py) would
# not exist. Copy the repo to an explicit destination instead.
COPY GPT-SoVITS /workspace/GPT-SoVITS
COPY constraints_gsv.txt gsv_server.py launch_gsv.sh /workspace/

# Fix: directory is spelled GPT-SoVITS; the original `GPT-SOVITS` would fail
# on the case-sensitive container filesystem.
RUN pip install -r GPT-SoVITS/extra-req.txt --no-deps \
    && pip install -r GPT-SoVITS/requirements.txt -c constraints_gsv.txt \
    && apt update \
    && apt install -y ffmpeg libsox-dev

ENTRYPOINT ["/bin/bash", "launch_gsv.sh"]
|
||||
9
Dockerfile_kokoro
Normal file
9
Dockerfile_kokoro
Normal file
@@ -0,0 +1,9 @@
|
||||
# Base image: Iluvatar corex toolchain, 3.2.1 — matches the
# torch==2.1.0+corex.3.2.1 pin in constraints_kokoro.txt.
FROM corex:3.2.1

WORKDIR /workspace

COPY requirements_kokoro.txt constraints_kokoro.txt kokoro_server.py launch_kokoro.sh /workspace/
# espeak-ng is the system G2P backend used by the kokoro/misaki frontend;
# the constraints file pins torch to the corex build.
RUN pip install -r requirements_kokoro.txt -c constraints_kokoro.txt \
    && apt update \
    && apt install -y espeak-ng

ENTRYPOINT ["/bin/bash", "launch_kokoro.sh"]
|
||||
5
README.md
Normal file
5
README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# tiangai100-f5-tts
|
||||
# tiangai100-kokoro-tts
|
||||
# tiangai100-gpt-sovits
|
||||
|
||||
【语音合成】
|
||||
1
constraints_f5.txt
Normal file
1
constraints_f5.txt
Normal file
@@ -0,0 +1 @@
|
||||
torch==2.1.0+corex.3.2.1
|
||||
1
constraints_gsv.txt
Normal file
1
constraints_gsv.txt
Normal file
@@ -0,0 +1 @@
|
||||
torch==2.1.0+corex.3.2.1
|
||||
1
constraints_kokoro.txt
Normal file
1
constraints_kokoro.txt
Normal file
@@ -0,0 +1 @@
|
||||
torch==2.1.0+corex.3.2.1
|
||||
133
f5_server.py
Normal file
133
f5_server.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import torch

# Disable the fused/flash scaled-dot-product attention kernels and force the
# math fallback — presumably the corex torch build does not support the fused
# kernels; TODO confirm on the target hardware.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)

from torch import Tensor
from typing import Optional, List
import torch.nn.functional as F

# def custom_conv1d_forward(self, input: Tensor, debug=False) -> Tensor:
#     with torch.amp.autocast(input.device.type, dtype=torch.float):
#         return self._conv_forward(input, self.weight, self.bias)

# torch.nn.Conv1d.forward = custom_conv1d_forward

def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Replacement for torch.nn.ConvTranspose1d.forward.

    Mirrors the upstream torch implementation (padding-mode check plus
    _output_padding), but runs the convolution under fp16 autocast and casts
    the result back to fp32 — presumably a dtype workaround for the corex
    backend; TODO confirm why fp16 is required here.
    """
    if self.padding_mode != 'zeros':
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')

    assert isinstance(self.padding, tuple)
    # One cannot replace List by Tuple or Sequence in "_output_padding" because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    num_spatial_dims = 1
    output_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size, # type: ignore[arg-type]
        num_spatial_dims, self.dilation) # type: ignore[arg-type]
    # Run in fp16 under autocast, then promote the result back to fp32.
    with torch.amp.autocast('cuda', dtype=torch.float16):
        return F.conv_transpose1d(
            input, self.weight, self.bias, self.stride, self.padding,
            output_padding, self.groups, self.dilation).float()

# Global monkeypatch: affects every ConvTranspose1d in this process.
torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
|
||||
|
||||
from f5_tts.infer.utils_infer import (
|
||||
load_vocoder,
|
||||
load_model,
|
||||
chunk_text,
|
||||
infer_batch_process,
|
||||
)
|
||||
from omegaconf import OmegaConf
|
||||
from hydra.utils import get_class
|
||||
import torchaudio
|
||||
import io
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi import UploadFile, File, Form
|
||||
from fastapi.responses import StreamingResponse, JSONResponse
|
||||
from contextlib import asynccontextmanager
|
||||
import uvicorn
|
||||
import os
|
||||
|
||||
import logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
level=os.environ.get("LOGLEVEL", "INFO"),
|
||||
)
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
|
||||
model_dir = os.getenv('MODEL_DIR', '/models/SWivid/F5-TTS')
|
||||
vocoder_dir = os.getenv('VOCODER_DIR', '/models/charactr/vocos-mel-24khz')
|
||||
|
||||
ema_model = None
|
||||
vocoder = None
|
||||
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
|
||||
def init():
    """Load the vocoder and the F5-TTS model into the module globals.

    Called once from the FastAPI lifespan hook before serving requests.
    """
    global ema_model, vocoder
    # load vocoder
    vocoder_name = 'vocos'
    vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=True, local_path=vocoder_dir, device=device)

    # load TTS model
    model_cfg = OmegaConf.load('/workspace/F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml')
    model_cls = get_class(f'f5_tts.model.{model_cfg.model.backbone}')
    model_arc = model_cfg.model.arch
    # NOTE(review): checkpoint/vocab filenames are hard-coded; assumes the
    # layout under MODEL_DIR matches the SWivid/F5-TTS release — confirm.
    ckpt_file = os.path.join(model_dir, 'F5TTS_v1_Base/model_1250000.safetensors')
    vocab_file = os.path.join(model_dir, 'F5TTS_v1_Base/vocab.txt')
    ema_model = load_model(
        model_cls, model_arc, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file, device=device
    )
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load models once at startup; no teardown."""
    init()  # startup: populate ema_model / vocoder module globals
    yield   # application serves requests while suspended here


app = FastAPI(lifespan=lifespan)
|
||||
|
||||
def tts_generate(gen_text, ref_audio, ref_text):
    """Stream synthesized audio chunks for `gen_text`, voice-cloned from a
    reference clip.

    Args:
        gen_text: text to synthesize.
        ref_audio: reference audio as the raw bytes of an encoded audio file.
        ref_text: transcript of the reference audio.

    Yields:
        bytes: raw audio chunks (pcm16 at 24 kHz per the /generate comment).
    """
    global ema_model, vocoder

    audio, sr = torchaudio.load(io.BytesIO(ref_audio))
    # Per-batch character budget scaled by ref-text density (bytes per second
    # of reference audio), capped at 135 — mirrors the upstream f5_tts infer
    # utilities; the 22-second constant is presumably the model's max window,
    # TODO confirm.
    max_chars = min(int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr)), 135)
    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
    for gen_audio, gen_sr in infer_batch_process(
        (audio, sr),
        ref_text,
        gen_text_batches,
        ema_model,
        vocoder,
        device=device,
        streaming=True,
        chunk_size=int(24e6),
        # nfe_step=16,
    ):
        yield gen_audio.tobytes()
|
||||
|
||||
# return 24kHz pcm16
@app.post("/generate")
async def generate(
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(...),
    text: str = Form(...)
):
    """Voice-clone TTS endpoint: streams audio for `text` using the uploaded
    `ref_audio`/`ref_text` pair as the voice prompt."""
    audio_bytes = await ref_audio.read()
    return StreamingResponse(
        tts_generate(text, ref_audio=audio_bytes, ref_text=ref_text),
        # NOTE(review): the body is raw pcm16 chunks, not a WAV container,
        # despite the declared media type — confirm consumers expect this.
        media_type="audio/wav"
    )
|
||||
|
||||
|
||||
@app.get("/ready")
|
||||
@app.get("/health")
|
||||
async def ready():
|
||||
return JSONResponse(status_code=200, content={"status": "ok"})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
uvicorn.run(app, host="0.0.0.0", port=80)
|
||||
245
gsv_server.py
Normal file
245
gsv_server.py
Normal file
@@ -0,0 +1,245 @@
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
level=os.environ.get("LOGLEVEL", "INFO"),
|
||||
)
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
from typing import Optional, List
|
||||
import torch.nn.functional as F
|
||||
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
torch.backends.cuda.enable_math_sdp(True)
|
||||
|
||||
def custom_conv1d_forward(self, input: Tensor) -> Tensor:
    """Replacement for torch.nn.Conv1d.forward.

    fp16 CUDA inputs are computed in fp32 under autocast and cast back to
    fp16 — presumably fp16 conv1d is broken or unsupported on the corex
    backend; TODO confirm. All other inputs go through unchanged.
    """
    if input.dtype == torch.float16 and input.device.type == 'cuda':
        with torch.amp.autocast(input.device.type, dtype=torch.float):
            return self._conv_forward(input, self.weight, self.bias).half()
    else:
        return self._conv_forward(input, self.weight, self.bias)


# Global monkeypatch: affects every Conv1d in this process.
torch.nn.Conv1d.forward = custom_conv1d_forward
|
||||
|
||||
def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Replacement for torch.nn.ConvTranspose1d.forward.

    Mirrors the upstream implementation (padding-mode check plus
    _output_padding), but fp32 CUDA inputs are computed under fp16 autocast
    and cast back to fp32 — presumably a dtype workaround for the corex
    backend; TODO confirm. All other inputs go through unchanged.
    """
    if self.padding_mode != 'zeros':
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')

    assert isinstance(self.padding, tuple)
    # One cannot replace List by Tuple or Sequence in "_output_padding" because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    num_spatial_dims = 1
    output_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size, # type: ignore[arg-type]
        num_spatial_dims, self.dilation) # type: ignore[arg-type]
    if input.dtype == torch.float and input.device.type == 'cuda':
        with torch.amp.autocast('cuda', dtype=torch.float16):
            return F.conv_transpose1d(
                input, self.weight, self.bias, self.stride, self.padding,
                output_padding, self.groups, self.dilation).float()
    else:
        return F.conv_transpose1d(
            input, self.weight, self.bias, self.stride, self.padding,
            output_padding, self.groups, self.dilation)


# Global monkeypatch: affects every ConvTranspose1d in this process.
torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
|
||||
|
||||
|
||||
# Run from inside the vendored GPT-SoVITS checkout so its relative resource
# paths resolve, and put its packages on sys.path before importing them.
now_dir = os.getcwd()
os.chdir(f'{now_dir}/GPT-SoVITS')
now_dir = os.getcwd()
# sys.path.append(now_dir)
sys.path.insert(0, now_dir)
sys.path.append("%s/GPT_SoVITS" % (now_dir))

import sv
# Redirect the speaker-verification checkpoint to MODEL_DIR before anything
# in the pipeline loads it (sv reads sv_path at model-construction time).
sv.sv_path = os.path.join(os.getenv("MODEL_DIR", "GPT_SoVITS/pretrained_models"), "sv/pretrained_eres2netv2w24s4ep4.ckpt")
|
||||
|
||||
import subprocess
|
||||
import signal
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from fastapi import FastAPI, UploadFile, File, Form
|
||||
from fastapi.responses import StreamingResponse, JSONResponse
|
||||
from contextlib import asynccontextmanager
|
||||
import uvicorn
|
||||
from io import BytesIO
|
||||
from tools.i18n.i18n import I18nAuto
|
||||
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
|
||||
import hashlib
|
||||
from fast_langdetect import detect_language
|
||||
|
||||
model_dir = os.getenv('MODEL_DIR', '/mnt/models/GPT-SoVITS')
|
||||
|
||||
# print(sys.path)
|
||||
i18n = I18nAuto()
|
||||
tts_pipeline = None
|
||||
|
||||
def init():
    """Build the GPT-SoVITS TTS pipeline from checkpoints under MODEL_DIR.

    Called once from the FastAPI lifespan hook; stores the pipeline in the
    module-global tts_pipeline.
    """
    global tts_pipeline

    gsv_config = {
        # "version": "v2ProPlus",
        "custom": {
            "bert_base_path": os.path.join(model_dir, "chinese-roberta-wwm-ext-large"),
            "cnhuhbert_base_path": os.path.join(model_dir, "chinese-hubert-base"),
            "device": "cuda",
            # fp32 inference — the Conv1d monkeypatch above suggests fp16 is
            # unreliable on this backend; TODO confirm.
            "is_half": False,
            "t2s_weights_path": os.path.join(model_dir, "s1v3.ckpt"),
            "version": "v2ProPlus",
            "vits_weights_path": os.path.join(model_dir, "v2Pro/s2Gv2ProPlus.pth")
        }
    }
    tts_config = TTS_Config(gsv_config)
    # tts_config = TTS_Config(config_path)
    tts_pipeline = TTS(tts_config)
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: build the TTS pipeline once at startup."""
    init()
    yield
    # no teardown required
    pass


app = FastAPI(lifespan=lifespan)
|
||||
|
||||
### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode mono `data` at `rate` Hz as an OGG stream into `io_buffer`."""
    sink = sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg")
    with sink as ogg_file:
        ogg_file.write(data)
    return io_buffer
|
||||
|
||||
|
||||
def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Append the raw PCM bytes of `data` to `io_buffer`; `rate` is unused."""
    raw_bytes = data.tobytes()
    io_buffer.write(raw_bytes)
    return io_buffer
|
||||
|
||||
|
||||
def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode mono `data` at `rate` Hz as a WAV file into `io_buffer`.

    Fix: the original rebound `io_buffer` to a fresh BytesIO, silently
    discarding the buffer the caller passed in. Write into the caller's
    buffer instead, consistent with pack_ogg / pack_raw / pack_aac.
    """
    sf.write(io_buffer, data, rate, format="wav")
    return io_buffer
|
||||
|
||||
|
||||
def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode mono pcm16 `data` to an ADTS AAC stream via ffmpeg and append
    the encoded bytes to `io_buffer`."""
    process = subprocess.Popen(
        [
            "ffmpeg",
            "-f",
            "s16le",  # input: 16-bit signed little-endian PCM
            "-ar",
            str(rate),  # input sample rate
            "-ac",
            "1",  # mono
            "-i",
            "pipe:0",  # read input from stdin
            "-c:a",
            "aac",  # encode with the AAC codec
            "-b:a",
            "192k",  # bitrate
            "-vn",  # no video
            "-f",
            "adts",  # output a raw AAC (ADTS) stream
            "pipe:1",  # write output to stdout
        ],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # communicate() feeds stdin and drains stdout/stderr concurrently, so the
    # piped stderr cannot deadlock; ffmpeg diagnostics are discarded.
    out, _ = process.communicate(input=data.tobytes())
    io_buffer.write(out)
    return io_buffer
|
||||
|
||||
|
||||
def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str):
    """Serialize `data` into `io_buffer` in the requested container format.

    "ogg", "aac" and "wav" select the matching packer; any other value
    (including None) falls back to raw PCM. The buffer is rewound to the
    start before being returned.
    """
    packers = {"ogg": pack_ogg, "aac": pack_aac, "wav": pack_wav}
    packer = packers.get(media_type, pack_raw)
    io_buffer = packer(io_buffer, data, rate)
    io_buffer.seek(0)
    return io_buffer
|
||||
|
||||
|
||||
def encode_audio_key(audio_bytes: bytes) -> str:
    """Return a short deterministic cache key: first 16 hex chars of MD5."""
    digest = hashlib.md5(audio_bytes)
    return digest.hexdigest()[:16]
|
||||
|
||||
def tts_generate(gen_text, text_lang="zh", ref_audio=None, ref_text=None):
|
||||
if isinstance(ref_audio, str):
|
||||
ref_audio_path = ref_audio
|
||||
else:
|
||||
audio_key = encode_audio_key(ref_audio)
|
||||
os.makedirs("/workspace/wav", exist_ok=True)
|
||||
if not os.path.exists(f"/workspace/wav/{audio_key}.wav"):
|
||||
with open(f"/workspace/wav/{audio_key}.wav", "wb") as f:
|
||||
f.write(ref_audio)
|
||||
ref_audio_path = f"/workspace/wav/{audio_key}.wav"
|
||||
ref_lang = detect_language(ref_text).lower() if ref_text else text_lang
|
||||
|
||||
req = {
|
||||
"text": gen_text,
|
||||
"text_lang": text_lang,
|
||||
"ref_audio_path": ref_audio_path,
|
||||
"prompt_text": ref_text,
|
||||
"prompt_lang": ref_lang,
|
||||
"text_split_method": "cut2",
|
||||
"media_type": "wav",
|
||||
"speed_factor": 1.0,
|
||||
"parallel_infer": False,
|
||||
"batch_size": 1,
|
||||
"split_bucket": False,
|
||||
"streaming_mode": True
|
||||
}
|
||||
|
||||
streaming_mode = req.get("streaming_mode", False)
|
||||
return_fragment = req.get("return_fragment", False)
|
||||
media_type = req.get("media_type", "wav")
|
||||
|
||||
# check_res = check_params(req)
|
||||
# if check_res is not None:
|
||||
# return check_res
|
||||
|
||||
if streaming_mode or return_fragment:
|
||||
req["return_fragment"] = True
|
||||
|
||||
tts_generator = tts_pipeline.run(req)
|
||||
for sr, chunk in tts_generator:
|
||||
yield pack_audio(BytesIO(), chunk, sr, media_type=None).getvalue()
|
||||
|
||||
# return 32kHz pcm16
@app.post("/generate")
async def generate(
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(...),
    text: str = Form(...),
    lang: str = Form("zh")
):
    """Voice-clone TTS endpoint: streams audio for `text` in language `lang`,
    using the uploaded `ref_audio`/`ref_text` pair as the voice prompt."""
    audio_bytes = await ref_audio.read()
    return StreamingResponse(
        tts_generate(text, text_lang=lang, ref_audio=audio_bytes, ref_text=ref_text),
        # NOTE(review): the body is raw pcm16 (see tts_generate), not a WAV
        # container, despite the declared media type — confirm consumers.
        media_type="audio/wav"
    )
|
||||
|
||||
@app.get("/ready")
|
||||
@app.get("/health")
|
||||
async def ready():
|
||||
return JSONResponse(status_code=200, content={"status": "ok"})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
uvicorn.run(app=app, host="0.0.0.0", port=80, workers=1)
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
os.kill(os.getpid(), signal.SIGTERM)
|
||||
exit(0)
|
||||
132
kokoro_server.py
Normal file
132
kokoro_server.py
Normal file
@@ -0,0 +1,132 @@
|
||||
import os
|
||||
|
||||
from fastapi import FastAPI, Body
|
||||
from fastapi.responses import StreamingResponse, JSONResponse
|
||||
from contextlib import asynccontextmanager
|
||||
import uvicorn
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from kokoro import KPipeline, KModel
|
||||
import numpy as np
|
||||
# from scipy.signal import resample
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
from torch.nn import functional as F
|
||||
from typing import Optional, List
|
||||
|
||||
def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Replacement for torch.nn.ConvTranspose1d.forward.

    Mirrors the upstream torch implementation (padding-mode check plus
    _output_padding), but runs the convolution under fp16 autocast and casts
    the result back to fp32 — presumably a dtype workaround for the corex
    backend; TODO confirm why fp16 is required here.
    """
    if self.padding_mode != 'zeros':
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')

    assert isinstance(self.padding, tuple)
    # One cannot replace List by Tuple or Sequence in "_output_padding" because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    num_spatial_dims = 1
    output_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size, # type: ignore[arg-type]
        num_spatial_dims, self.dilation) # type: ignore[arg-type]
    with torch.amp.autocast('cuda', dtype=torch.float16):
        return F.conv_transpose1d(
            input, self.weight, self.bias, self.stride, self.padding,
            output_padding, self.groups, self.dilation).float()


# Global monkeypatch: affects every ConvTranspose1d in this process.
torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
|
||||
|
||||
|
||||
repo_id = 'hexgrad/Kokoro-82M-v1.1-zh'
|
||||
# MODEL_SR = 24000
|
||||
model = None
|
||||
en_empty_pipeline = None
|
||||
en_pipeline = None
|
||||
zh_pipeline = None
|
||||
en_voice_pt = None
|
||||
zh_voice_pt = None
|
||||
en_voice = os.getenv('EN_VOICE', 'af_maple.pt')
|
||||
zh_voice = os.getenv('ZH_VOICE', 'zf_046.pt')
|
||||
model_dir = os.getenv('MODEL_DIR', '/models/hexgrad/Kokoro-82M-v1.1-zh')
|
||||
|
||||
def en_callable(text):
    """Return phonemes for an English span embedded in Chinese text.

    A couple of proper nouns get hand-tuned pronunciations; everything else
    is phonemized by the model-free English pipeline.
    """
    overrides = {'Kokoro': 'kˈOkəɹO', 'Sol': 'sˈOl'}
    if text in overrides:
        return overrides[text]
    return next(en_empty_pipeline(text)).phonemes
|
||||
|
||||
# HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens
# Simple piecewise linear fn that decreases speed as len_ps increases
def speed_callable(len_ps):
    """Piecewise-linear speed: 1.0 up to 83 phonemes, then ramping linearly
    down to 0.8 at 183 phonemes and beyond."""
    if len_ps <= 83:
        return 1
    if len_ps < 183:
        return 1 - (len_ps - 83) / 500
    return 0.8
|
||||
|
||||
# def resample_audio(data: np.ndarray, original_rate: int, target_rate: int):
|
||||
# ori_dtype = data.dtype
|
||||
# # data = normalize_audio(data)
|
||||
# number_of_samples = int(len(data) * float(target_rate) / original_rate)
|
||||
# resampled_data = resample(data, number_of_samples)
|
||||
# # resampled_data = normalize_audio(resampled_data)
|
||||
# return resampled_data.astype(ori_dtype)
|
||||
|
||||
def audio_postprocess(audio: np.ndarray):
    """Convert float32 model output in [-1, 1] to pcm16; pass other dtypes through.

    Fix: clip before scaling — float samples slightly outside [-1, 1]
    (possible at synthesis peaks) previously wrapped around via int16
    overflow, producing loud clicks in the stream.
    """
    if audio.dtype == np.float32:
        audio = np.int16(np.clip(audio, -1.0, 1.0) * 32767)
    return audio
|
||||
|
||||
def init():
    """Load the Kokoro model, language pipelines and voice paths into
    module globals.

    Called once from the FastAPI lifespan hook before serving requests.
    """
    global model, en_empty_pipeline, en_pipeline, zh_pipeline
    global en_voice_pt, zh_voice_pt
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = KModel(repo_id=repo_id, model=os.path.join(model_dir, 'kokoro-v1_1-zh.pth'), config=os.path.join(model_dir, 'config.json')).to(device).eval()
    # model=False: a G2P-only pipeline used by en_callable to phonemize
    # English spans without running the acoustic model.
    en_empty_pipeline = KPipeline(lang_code='a', repo_id=repo_id, model=False)
    en_pipeline = KPipeline(lang_code='a', repo_id=repo_id, model=model)
    zh_pipeline = KPipeline(lang_code='z', repo_id=repo_id, model=model, en_callable=en_callable)
    en_voice_pt = os.path.join(model_dir, 'voices', en_voice)
    zh_voice_pt = os.path.join(model_dir, 'voices', zh_voice)
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load model and pipelines once at startup."""
    init()
    yield
    # no teardown required
    pass


app = FastAPI(lifespan=lifespan)


# Qualified prefix for attributes in the reserved XML namespace (xml:lang etc.).
xml_namespace = "{http://www.w3.org/XML/1998/namespace}"
|
||||
|
||||
# return 24kHz pcm-16
@app.post("/tts")
def generate(ssml: str = Body(...)):
    """SSML TTS endpoint: extract the first <voice> element's text and
    xml:lang attribute, then stream pcm16 audio from the matching pipeline.

    Returns 400 on malformed XML or a missing <voice> element.
    """
    try:
        root = ET.fromstring(ssml)
        # NOTE(review): find(".//voice") ignores XML namespaces, so SSML
        # documents with a default namespace will not match — confirm that
        # callers send namespace-free markup.
        voice_element = root.find(".//voice")
        if voice_element is not None:
            text = voice_element.text.strip()
            # xml:lang lives in the XML namespace, hence the qualified key.
            language = voice_element.get(f'{xml_namespace}lang', "zh").strip()
        else:
            return JSONResponse(status_code=400, content={"message": "Invalid SSML format: <voice> element not found."})
    except ET.ParseError as e:
        return JSONResponse(status_code=400, content={"message": "Invalid SSML format", "Exception": str(e)})

    def streaming_generator():
        # 'en' selects the English pipeline/voice; anything else is Chinese,
        # with the length-aware speed ramp applied.
        if language == 'en':
            generator = en_pipeline(text=text, voice=en_voice_pt)
        else:
            generator = zh_pipeline(text=text, voice=zh_voice_pt, speed=speed_callable)
        for (_, _, audio) in generator:
            yield audio_postprocess(audio.numpy()).tobytes()

    return StreamingResponse(streaming_generator(), media_type='audio/wav')
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
@app.get("/ready")
|
||||
async def ready():
|
||||
return JSONResponse(status_code=200, content={"status": "ok"})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
uvicorn.run(app, host="0.0.0.0", port=80)
|
||||
3
launch_f5.sh
Executable file
3
launch_f5.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash

# Container entrypoint for the F5-TTS service: run the FastAPI server (port 80).
python3 f5_server.py
|
||||
15
launch_gsv.sh
Executable file
15
launch_gsv.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash

# Container entrypoint for the GPT-SoVITS service. Each env var is only
# defaulted when the deployment has not already set it.

# Root directory holding all GPT-SoVITS checkpoints (read by gsv_server.py).
if [ -z "$MODEL_DIR" ]; then
    export MODEL_DIR="/models/GPT-SoVITS"
fi

# NLTK resources shipped alongside the models — presumably used by the text
# frontend; TODO confirm which component reads NLTK_DATA.
if [ -z "$NLTK_DATA" ]; then
    export NLTK_DATA="/models/GPT-SoVITS/nltk_data"
fi

# BERT checkpoint path consumed by the GPT-SoVITS text frontend.
if [ -z "$bert_path" ]; then
    export bert_path="${MODEL_DIR}/chinese-roberta-wwm-ext-large"
fi

python3 gsv_server.py
|
||||
4
launch_kokoro.sh
Executable file
4
launch_kokoro.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash

# Container entrypoint for the Kokoro service: run the FastAPI server (port 80).
python3 kokoro_server.py
|
||||
|
||||
3
requirements_f5.txt
Normal file
3
requirements_f5.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
f5-tts
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
5
requirements_kokoro.txt
Normal file
5
requirements_kokoro.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
kokoro>=0.8.2
|
||||
misaki[zh]>=0.8.2
|
||||
soundfile
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
Reference in New Issue
Block a user