merge code repo for f5 and gpt and kokoro

This commit is contained in:
zhousha
2025-08-12 14:15:41 +08:00
commit 05a77ed283
16 changed files with 574 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

7
Dockerfile_f5 Normal file
View File

@@ -0,0 +1,7 @@
# F5-TTS serving image on the Iluvatar corex base.
FROM corex:3.2.1
WORKDIR /workspace
# Server code plus pinned deps; the constraints file pins the corex torch build.
COPY requirements_f5.txt constraints_f5.txt f5_server.py launch_f5.sh /workspace/
RUN pip install -r requirements_f5.txt -c constraints_f5.txt
ENTRYPOINT ["/bin/bash", "launch_f5.sh"]

10
Dockerfile_gsv Normal file
View File

@@ -0,0 +1,10 @@
# GPT-SoVITS serving image on the Iluvatar corex base.
FROM corex:3.2.1
WORKDIR /workspace
# NOTE(review): with multiple COPY sources, the *contents* of GPT-SoVITS are
# copied into /workspace rather than to /workspace/GPT-SoVITS — verify the
# build-context layout, since gsv_server.py chdirs into /workspace/GPT-SoVITS.
COPY GPT-SoVITS constraints_gsv.txt gsv_server.py launch_gsv.sh /workspace/
# Fix: requirements path casing must match the copied directory name
# (GPT-SoVITS, not GPT-SOVITS) — Linux image filesystems are case-sensitive.
RUN pip install -r GPT-SoVITS/extra-req.txt --no-deps \
    && pip install -r GPT-SoVITS/requirements.txt -c constraints_gsv.txt \
    && apt update \
    && apt install -y ffmpeg libsox-dev
ENTRYPOINT ["/bin/bash", "launch_gsv.sh"]

9
Dockerfile_kokoro Normal file
View File

@@ -0,0 +1,9 @@
# Kokoro TTS serving image on the Iluvatar corex base.
FROM corex:3.2.1
WORKDIR /workspace
COPY requirements_kokoro.txt constraints_kokoro.txt kokoro_server.py launch_kokoro.sh /workspace/
# espeak-ng is required by the kokoro stack at runtime — presumably its G2P fallback.
RUN pip install -r requirements_kokoro.txt -c constraints_kokoro.txt \
    && apt update \
    && apt install -y espeak-ng
ENTRYPOINT ["/bin/bash", "launch_kokoro.sh"]

5
README.md Normal file
View File

@@ -0,0 +1,5 @@
# tiangai100-f5-tts
# tiangai100-kokoro-tts
# tiangai100-gpt-sovits
【语音合成】Text-to-speech serving images for F5-TTS, Kokoro, and GPT-SoVITS.

1
constraints_f5.txt Normal file
View File

@@ -0,0 +1 @@
torch==2.1.0+corex.3.2.1

1
constraints_gsv.txt Normal file
View File

@@ -0,0 +1 @@
torch==2.1.0+corex.3.2.1

1
constraints_kokoro.txt Normal file
View File

@@ -0,0 +1 @@
torch==2.1.0+corex.3.2.1

133
f5_server.py Normal file
View File

@@ -0,0 +1,133 @@
import torch
# Force the math SDP backend: the flash / mem-efficient SDP kernels are
# presumably unsupported on this corex (Iluvatar) torch build — TODO confirm.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
from torch import Tensor
from typing import Optional, List
import torch.nn.functional as F
# def custom_conv1d_forward(self, input: Tensor, debug=False) -> Tensor:
#     with torch.amp.autocast(input.device.type, dtype=torch.float):
#         return self._conv_forward(input, self.weight, self.bias)
# torch.nn.Conv1d.forward = custom_conv1d_forward
def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Patched ConvTranspose1d.forward: run the transposed convolution under
    float16 autocast and cast the result back to float32.

    Mirrors torch's stock implementation (padding-mode check plus
    ``_output_padding``) with only the autocast wrapper added.
    """
    if self.padding_mode != 'zeros':
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
    assert isinstance(self.padding, tuple)
    # One cannot replace List by Tuple or Sequence in "_output_padding" because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    num_spatial_dims = 1
    output_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size,  # type: ignore[arg-type]
        num_spatial_dims, self.dilation)  # type: ignore[arg-type]
    # Compute in fp16 on this build, hand callers fp32.
    with torch.amp.autocast('cuda', dtype=torch.float16):
        return F.conv_transpose1d(
            input, self.weight, self.bias, self.stride, self.padding,
            output_padding, self.groups, self.dilation).float()
# Process-wide monkey-patch: every ConvTranspose1d (vocoder included) uses it.
torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
from f5_tts.infer.utils_infer import (
load_vocoder,
load_model,
chunk_text,
infer_batch_process,
)
from omegaconf import OmegaConf
from hydra.utils import get_class
import torchaudio
import io
from fastapi import FastAPI
from fastapi import UploadFile, File, Form
from fastapi.responses import StreamingResponse, JSONResponse
from contextlib import asynccontextmanager
import uvicorn
import os
import logging
# Timestamped log records; level comes from the LOGLEVEL env var (default INFO).
logging.basicConfig(
    format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO"),
)
logger = logging.getLogger(__file__)
# Model locations are overridable via env for containerized deployments.
model_dir = os.getenv('MODEL_DIR', '/models/SWivid/F5-TTS')
vocoder_dir = os.getenv('VOCODER_DIR', '/models/charactr/vocos-mel-24khz')
# Populated once by init() at application startup.
ema_model = None
vocoder = None
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def init():
    """Load the vocos vocoder and the F5-TTS v1 Base model into module globals.

    Called once from the FastAPI lifespan hook; reads local checkpoints from
    ``vocoder_dir`` and ``model_dir``.
    """
    global ema_model, vocoder
    # load vocoder
    vocoder_name = 'vocos'
    vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=True, local_path=vocoder_dir, device=device)
    # load TTS model: architecture from the repo config, weights from MODEL_DIR
    model_cfg = OmegaConf.load('/workspace/F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml')
    model_cls = get_class(f'f5_tts.model.{model_cfg.model.backbone}')
    model_arc = model_cfg.model.arch
    ckpt_file = os.path.join(model_dir, 'F5TTS_v1_Base/model_1250000.safetensors')
    vocab_file = os.path.join(model_dir, 'F5TTS_v1_Base/vocab.txt')
    ema_model = load_model(
        model_cls, model_arc, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file, device=device
    )
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load models before serving; nothing to tear down."""
    init()
    yield
    pass
app = FastAPI(lifespan=lifespan)
def tts_generate(gen_text, ref_audio, ref_text):
    """Yield synthesized speech for ``gen_text``, voice-cloned from a reference clip.

    Parameters:
        gen_text: text to synthesize.
        ref_audio: raw bytes of the reference audio file.
        ref_text: transcript of the reference audio.

    Yields raw PCM chunks (bytes) from the streaming inference loop.
    """
    global ema_model, vocoder
    audio, sr = torchaudio.load(io.BytesIO(ref_audio))
    # Size each text chunk relative to the reference clip's speech rate,
    # capped at 135 chars — assumes ~22 s total audio budget; TODO confirm.
    max_chars = min(int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr)), 135)
    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
    for gen_audio, gen_sr in infer_batch_process(
        (audio, sr),
        ref_text,
        gen_text_batches,
        ema_model,
        vocoder,
        device=device,
        streaming=True,
        chunk_size=int(24e6),
        # nfe_step=16,
    ):
        yield gen_audio.tobytes()
# return 24kHz pcm16
@app.post("/generate")
async def generate(
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(...),
    text: str = Form(...)
):
    """Voice-cloning TTS endpoint: streams audio chunks for ``text``.

    NOTE(review): media_type claims "audio/wav" but the generator yields raw
    pcm16 without a WAV header — confirm clients expect headerless 24 kHz PCM.
    """
    audio_bytes = await ref_audio.read()
    return StreamingResponse(
        tts_generate(text, ref_audio=audio_bytes, ref_text=ref_text),
        media_type="audio/wav"
    )
@app.get("/ready")
@app.get("/health")
async def ready():
    """Liveness/readiness probe; returns 200 once the app is serving."""
    return JSONResponse(status_code=200, content={"status": "ok"})
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=80)

245
gsv_server.py Normal file
View File

@@ -0,0 +1,245 @@
import os
import sys
import traceback
import logging
logging.basicConfig(
format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO"),
)
logger = logging.getLogger(__file__)
import torch
from torch import Tensor
from typing import Optional, List
import torch.nn.functional as F
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
def custom_conv1d_forward(self, input: Tensor) -> Tensor:
    """Patched Conv1d.forward: for fp16 CUDA inputs, compute in fp32 under
    autocast and cast back to half (fp16 conv1d workaround on this torch
    build); all other inputs take the stock path."""
    if input.dtype == torch.float16 and input.device.type == 'cuda':
        with torch.amp.autocast(input.device.type, dtype=torch.float):
            return self._conv_forward(input, self.weight, self.bias).half()
    else:
        return self._conv_forward(input, self.weight, self.bias)
# Process-wide monkey-patch for every Conv1d in the model stack.
torch.nn.Conv1d.forward = custom_conv1d_forward
def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Patched ConvTranspose1d.forward for this torch build.

    fp32 CUDA inputs are computed in fp16 under autocast and cast back to
    fp32; every other case takes the stock path. Mirrors torch's standard
    implementation (padding-mode check plus ``_output_padding``).
    """
    if self.padding_mode != 'zeros':
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
    assert isinstance(self.padding, tuple)
    # One cannot replace List by Tuple or Sequence in "_output_padding" because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    num_spatial_dims = 1
    output_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size,  # type: ignore[arg-type]
        num_spatial_dims, self.dilation)  # type: ignore[arg-type]
    if input.dtype == torch.float and input.device.type == 'cuda':
        with torch.amp.autocast('cuda', dtype=torch.float16):
            return F.conv_transpose1d(
                input, self.weight, self.bias, self.stride, self.padding,
                output_padding, self.groups, self.dilation).float()
    else:
        return F.conv_transpose1d(
            input, self.weight, self.bias, self.stride, self.padding,
            output_padding, self.groups, self.dilation)
torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
# Run from inside the vendored GPT-SoVITS checkout and make its packages importable.
now_dir = os.getcwd()
os.chdir(f'{now_dir}/GPT-SoVITS')
now_dir = os.getcwd()
# sys.path.append(now_dir)
sys.path.insert(0, now_dir)
sys.path.append("%s/GPT_SoVITS" % (now_dir))
import sv
# Point the speaker-verification checkpoint at MODEL_DIR before anything loads it.
sv.sv_path = os.path.join(os.getenv("MODEL_DIR", "GPT_SoVITS/pretrained_models"), "sv/pretrained_eres2netv2w24s4ep4.ckpt")
import subprocess
import signal
import numpy as np
import soundfile as sf
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import StreamingResponse, JSONResponse
from contextlib import asynccontextmanager
import uvicorn
from io import BytesIO
from tools.i18n.i18n import I18nAuto
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
import hashlib
from fast_langdetect import detect_language
model_dir = os.getenv('MODEL_DIR', '/mnt/models/GPT-SoVITS')
# print(sys.path)
i18n = I18nAuto()
tts_pipeline = None
def init():
    """Build the global GPT-SoVITS pipeline from v2ProPlus weights in MODEL_DIR."""
    global tts_pipeline
    gsv_config = {
        # "version": "v2ProPlus",
        "custom": {
            "bert_base_path": os.path.join(model_dir, "chinese-roberta-wwm-ext-large"),
            "cnhuhbert_base_path": os.path.join(model_dir, "chinese-hubert-base"),
            "device": "cuda",
            "is_half": False,  # full-precision inference
            "t2s_weights_path": os.path.join(model_dir, "s1v3.ckpt"),
            "version": "v2ProPlus",
            "vits_weights_path": os.path.join(model_dir, "v2Pro/s2Gv2ProPlus.pth")
        }
    }
    tts_config = TTS_Config(gsv_config)
    # tts_config = TTS_Config(config_path)
    tts_pipeline = TTS(tts_config)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: build the TTS pipeline before serving; no teardown."""
    init()
    yield
    pass
app = FastAPI(lifespan=lifespan)
### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode mono audio ``data`` at ``rate`` Hz into ``io_buffer`` as OGG."""
    with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file:
        audio_file.write(data)
    return io_buffer
def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Append the samples to ``io_buffer`` as headerless PCM; ``rate`` is unused."""
    raw_bytes = data.tobytes()
    io_buffer.write(raw_bytes)
    return io_buffer
def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode ``data`` at ``rate`` Hz into ``io_buffer`` as a WAV file.

    Fix: the original rebound ``io_buffer`` to a fresh BytesIO, silently
    ignoring the caller's buffer. Harmless today (every call site passes a
    new buffer) but a trap for reuse; write into the buffer we were given.
    """
    sf.write(io_buffer, data, rate, format="wav")
    return io_buffer
def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode mono pcm16 ``data`` into an ADTS/AAC stream via an ffmpeg subprocess."""
    process = subprocess.Popen(
        [
            "ffmpeg",
            "-f",
            "s16le",  # input: signed 16-bit little-endian PCM
            "-ar",
            str(rate),  # input sample rate
            "-ac",
            "1",  # mono
            "-i",
            "pipe:0",  # read input from stdin
            "-c:a",
            "aac",  # encode with the AAC audio codec
            "-b:a",
            "192k",  # bitrate
            "-vn",  # no video stream
            "-f",
            "adts",  # output an ADTS AAC stream
            "pipe:1",  # write output to stdout
        ],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, _ = process.communicate(input=data.tobytes())
    io_buffer.write(out)
    return io_buffer
def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str):
    """Serialize ``data`` into ``io_buffer`` in the requested container.

    Known media types are "ogg", "aac" and "wav"; anything else (including
    ``None``) falls back to headerless raw PCM. The buffer is rewound before
    being returned so callers can read it from the start.
    """
    packers = {"ogg": pack_ogg, "aac": pack_aac, "wav": pack_wav}
    packer = packers.get(media_type, pack_raw)
    io_buffer = packer(io_buffer, data, rate)
    io_buffer.seek(0)
    return io_buffer
def encode_audio_key(audio_bytes: bytes) -> str:
    """Derive a short, stable cache key (first 16 hex chars of the MD5 digest)
    for a reference-audio blob. MD5 is used for dedup only, not security."""
    digest = hashlib.md5(audio_bytes).hexdigest()
    return digest[:16]
def tts_generate(gen_text, text_lang="zh", ref_audio=None, ref_text=None):
    """Stream GPT-SoVITS synthesis of ``gen_text``, voice-cloned from ``ref_audio``.

    ``ref_audio`` may be a filesystem path or raw bytes; bytes are cached under
    /workspace/wav keyed by a content hash so repeat requests reuse the file.
    The prompt language is auto-detected from ``ref_text`` when present,
    otherwise it falls back to ``text_lang``. Yields audio chunks (bytes).
    """
    if isinstance(ref_audio, str):
        ref_audio_path = ref_audio
    else:
        audio_key = encode_audio_key(ref_audio)
        os.makedirs("/workspace/wav", exist_ok=True)
        if not os.path.exists(f"/workspace/wav/{audio_key}.wav"):
            with open(f"/workspace/wav/{audio_key}.wav", "wb") as f:
                f.write(ref_audio)
        ref_audio_path = f"/workspace/wav/{audio_key}.wav"
    ref_lang = detect_language(ref_text).lower() if ref_text else text_lang
    req = {
        "text": gen_text,
        "text_lang": text_lang,
        "ref_audio_path": ref_audio_path,
        "prompt_text": ref_text,
        "prompt_lang": ref_lang,
        "text_split_method": "cut2",
        "media_type": "wav",
        "speed_factor": 1.0,
        "parallel_infer": False,
        "batch_size": 1,
        "split_bucket": False,
        "streaming_mode": True
    }
    streaming_mode = req.get("streaming_mode", False)
    return_fragment = req.get("return_fragment", False)
    media_type = req.get("media_type", "wav")
    # check_res = check_params(req)
    # if check_res is not None:
    #     return check_res
    if streaming_mode or return_fragment:
        req["return_fragment"] = True
    tts_generator = tts_pipeline.run(req)
    for sr, chunk in tts_generator:
        # NOTE(review): media_type=None forces pack_audio's raw-PCM branch, so
        # the local ``media_type`` ("wav") is ignored — confirm this is intended.
        yield pack_audio(BytesIO(), chunk, sr, media_type=None).getvalue()
# return 32kHz pcm16
# return 32kHz pcm16
@app.post("/generate")
async def generate(
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(...),
    text: str = Form(...),
    lang: str = Form("zh")
):
    """Voice-cloning TTS endpoint: streams audio chunks for ``text``.

    NOTE(review): media_type says "audio/wav" but the generator yields raw
    PCM (see tts_generate) — confirm clients expect headerless 32 kHz pcm16.
    """
    audio_bytes = await ref_audio.read()
    return StreamingResponse(
        tts_generate(text, text_lang=lang, ref_audio=audio_bytes, ref_text=ref_text),
        media_type="audio/wav"
    )
@app.get("/ready")
@app.get("/health")
async def ready():
    """Liveness/readiness probe; returns 200 once the app is serving."""
    return JSONResponse(status_code=200, content={"status": "ok"})
if __name__ == "__main__":
    try:
        uvicorn.run(app=app, host="0.0.0.0", port=80, workers=1)
    except Exception:
        # Log the failure and take the whole process down so the container restarts.
        traceback.print_exc()
        os.kill(os.getpid(), signal.SIGTERM)
        exit(0)

132
kokoro_server.py Normal file
View File

@@ -0,0 +1,132 @@
import os
from fastapi import FastAPI, Body
from fastapi.responses import StreamingResponse, JSONResponse
from contextlib import asynccontextmanager
import uvicorn
import xml.etree.ElementTree as ET
from kokoro import KPipeline, KModel
import numpy as np
# from scipy.signal import resample
import torch
from torch import Tensor
from torch.nn import functional as F
from typing import Optional, List
def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Patched ConvTranspose1d.forward: run the transposed convolution under
    float16 autocast and cast the result back to float32.

    Mirrors torch's stock implementation (padding-mode check plus
    ``_output_padding``) with only the autocast wrapper added.
    """
    if self.padding_mode != 'zeros':
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
    assert isinstance(self.padding, tuple)
    # One cannot replace List by Tuple or Sequence in "_output_padding" because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    num_spatial_dims = 1
    output_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size,  # type: ignore[arg-type]
        num_spatial_dims, self.dilation)  # type: ignore[arg-type]
    with torch.amp.autocast('cuda', dtype=torch.float16):
        return F.conv_transpose1d(
            input, self.weight, self.bias, self.stride, self.padding,
            output_padding, self.groups, self.dilation).float()
# Process-wide monkey-patch: every ConvTranspose1d in the model uses it.
torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
repo_id = 'hexgrad/Kokoro-82M-v1.1-zh'
# MODEL_SR = 24000
# Pipelines and voice paths are populated once by init() at startup.
model = None
en_empty_pipeline = None
en_pipeline = None
zh_pipeline = None
en_voice_pt = None
zh_voice_pt = None
# Voice checkpoints and model directory are overridable via env.
en_voice = os.getenv('EN_VOICE', 'af_maple.pt')
zh_voice = os.getenv('ZH_VOICE', 'zf_046.pt')
model_dir = os.getenv('MODEL_DIR', '/models/hexgrad/Kokoro-82M-v1.1-zh')
def en_callable(text):
    """Return phonemes for an English fragment embedded in Chinese input.

    A couple of proper nouns get hand-tuned pronunciations; everything else
    goes through the model-free English pipeline.
    """
    special = {
        'Kokoro': 'kˈOkəɹO',
        'Sol': 'sˈOl',
    }
    if text in special:
        return special[text]
    return next(en_empty_pipeline(text)).phonemes
# HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens
# Simple piecewise linear fn that decreases speed as len_ps increases
def speed_callable(len_ps):
    """Piecewise-linear speech speed: 1.0 up to 83 phonemes, ramping down
    linearly (slope 1/500) and bottoming out at 0.8 from 183 phonemes on."""
    if len_ps <= 83:
        return 1
    if len_ps < 183:
        return 1 - (len_ps - 83) / 500
    return 0.8
# def resample_audio(data: np.ndarray, original_rate: int, target_rate: int):
# ori_dtype = data.dtype
# # data = normalize_audio(data)
# number_of_samples = int(len(data) * float(target_rate) / original_rate)
# resampled_data = resample(data, number_of_samples)
# # resampled_data = normalize_audio(resampled_data)
# return resampled_data.astype(ori_dtype)
def audio_postprocess(audio: np.ndarray):
    """Convert float32 samples (assumed in [-1, 1]) to int16 PCM; any other
    dtype is returned untouched."""
    if audio.dtype != np.float32:
        return audio
    return np.int16(audio * 32767)
def init():
    """Load the Kokoro model and build the EN/ZH pipelines plus voice paths.

    Runs once from the FastAPI lifespan hook; results land in module globals.
    """
    global model, en_empty_pipeline, en_pipeline, zh_pipeline
    global en_voice_pt, zh_voice_pt
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = KModel(repo_id=repo_id, model=os.path.join(model_dir, 'kokoro-v1_1-zh.pth'), config=os.path.join(model_dir, 'config.json')).to(device).eval()
    # model=False -> no synthesis model; used by en_callable for phonemes only.
    en_empty_pipeline = KPipeline(lang_code='a', repo_id=repo_id, model=False)
    en_pipeline = KPipeline(lang_code='a', repo_id=repo_id, model=model)
    zh_pipeline = KPipeline(lang_code='z', repo_id=repo_id, model=model, en_callable=en_callable)
    en_voice_pt = os.path.join(model_dir, 'voices', en_voice)
    zh_voice_pt = os.path.join(model_dir, 'voices', zh_voice)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load models before serving; nothing to tear down."""
    init()
    yield
    pass
app = FastAPI(lifespan=lifespan)
# Namespace prefix ElementTree uses for xml:lang attributes.
xml_namespace = "{http://www.w3.org/XML/1998/namespace}"
# return 24kHz pcm-16
@app.post("/tts")
def generate(ssml: str = Body(...)):
    """Synthesize speech from a minimal SSML body.

    Expects a <voice> element; its text is synthesized and its xml:lang
    attribute (default "zh") selects the EN or ZH pipeline. Returns a
    stream of audio chunks; malformed SSML yields a 400.
    """
    try:
        root = ET.fromstring(ssml)
        voice_element = root.find(".//voice")
        if voice_element is not None:
            # NOTE(review): voice_element.text is None for an empty <voice/> —
            # .strip() would raise AttributeError; confirm inputs always carry text.
            text = voice_element.text.strip()
            language = voice_element.get(f'{xml_namespace}lang', "zh").strip()
        else:
            return JSONResponse(status_code=400, content={"message": "Invalid SSML format: <voice> element not found."})
    except ET.ParseError as e:
        return JSONResponse(status_code=400, content={"message": "Invalid SSML format", "Exception": str(e)})
    def streaming_generator():
        # Any language other than "en" routes to the Chinese pipeline.
        if language == 'en':
            generator = en_pipeline(text=text, voice=en_voice_pt)
        else:
            generator = zh_pipeline(text=text, voice=zh_voice_pt, speed=speed_callable)
        for (_, _, audio) in generator:
            yield audio_postprocess(audio.numpy()).tobytes()
    return StreamingResponse(streaming_generator(), media_type='audio/wav')
@app.get("/health")
@app.get("/ready")
async def ready():
    """Liveness/readiness probe; returns 200 once the app is serving."""
    return JSONResponse(status_code=200, content={"status": "ok"})
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=80)

3
launch_f5.sh Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Container entrypoint: start the F5-TTS FastAPI server (listens on :80).
python3 f5_server.py

15
launch_gsv.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Container entrypoint for the GPT-SoVITS server (listens on :80).
# Each location below may be overridden from the environment; the
# ${VAR:-default} expansions keep any non-empty value already set.
export MODEL_DIR="${MODEL_DIR:-/models/GPT-SoVITS}"
export NLTK_DATA="${NLTK_DATA:-/models/GPT-SoVITS/nltk_data}"
export bert_path="${bert_path:-${MODEL_DIR}/chinese-roberta-wwm-ext-large}"
python3 gsv_server.py

4
launch_kokoro.sh Executable file
View File

@@ -0,0 +1,4 @@
#!/bin/bash
# Container entrypoint: start the Kokoro FastAPI server (listens on :80).
python3 kokoro_server.py

3
requirements_f5.txt Normal file
View File

@@ -0,0 +1,3 @@
f5-tts
fastapi
uvicorn[standard]

5
requirements_kokoro.txt Normal file
View File

@@ -0,0 +1,5 @@
kokoro>=0.8.2
misaki[zh]>=0.8.2
soundfile
fastapi
uvicorn[standard]