merge code repo for f5 and gpt and kokoro

2025-08-12 14:15:41 +08:00
commit 05a77ed283
16 changed files with 574 additions and 0 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/7
+++ b/7
@@ -0,0 +1,7 @@
 # Image for the F5-TTS HTTP server (f5_server.py).
 FROM corex:3.2.1
 WORKDIR /workspace
 # Ship the dependency lists, the server and its launcher into the work dir.
 COPY requirements_f5.txt constraints_f5.txt f5_server.py launch_f5.sh /workspace/
 # Install the requirements with constraints_f5.txt applied as a pip constraints file.
 RUN pip install -r requirements_f5.txt -c constraints_f5.txt
 # The container runs the launcher script through bash.
 ENTRYPOINT ["/bin/bash", "launch_f5.sh"]
--- a/10
+++ b/10
@@ -0,0 +1,10 @@
 # Image for the GPT-SoVITS HTTP server (gsv_server.py).
 FROM corex:3.2.1
 WORKDIR /workspace
 # COPY with a directory source copies the directory's *contents*, so the
 # GPT-SoVITS tree gets its own instruction to land in /workspace/GPT-SoVITS,
 # which both the RUN step below and gsv_server.py (os.chdir) rely on.
 COPY GPT-SoVITS /workspace/GPT-SoVITS/
 COPY constraints_gsv.txt gsv_server.py launch_gsv.sh /workspace/
 # Paths are case-sensitive: the directory is "GPT-SoVITS" in both installs.
 RUN pip install -r GPT-SoVITS/extra-req.txt --no-deps \
    && pip install -r GPT-SoVITS/requirements.txt -c constraints_gsv.txt \
    && apt update \
    && apt install -y ffmpeg libsox-dev
 ENTRYPOINT ["/bin/bash", "launch_gsv.sh"]
--- a/9
+++ b/9
@@ -0,0 +1,9 @@
 # Image for the Kokoro HTTP server (kokoro_server.py).
 FROM corex:3.2.1
 WORKDIR /workspace
 # Ship the dependency lists, the server and its launcher into the work dir.
 COPY requirements_kokoro.txt constraints_kokoro.txt kokoro_server.py launch_kokoro.sh /workspace/
 # Install the Python requirements under the constraints file, then the
 # espeak-ng system package.
 RUN pip install -r requirements_kokoro.txt -c constraints_kokoro.txt \
    && apt update \
    && apt install -y espeak-ng
 # The container runs the launcher script through bash.
 ENTRYPOINT ["/bin/bash", "launch_kokoro.sh"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,5 @@
 # tiangai100-f5-tts
 # tiangai100-kokoro-tts
 # tiangai100-gpt-sovits
 【语音合成】
--- a/constraints_f5.txt
+++ b/constraints_f5.txt
@@ -0,0 +1 @@
 torch==2.1.0+corex.3.2.1
--- a/constraints_gsv.txt
+++ b/constraints_gsv.txt
@@ -0,0 +1 @@
 torch==2.1.0+corex.3.2.1
--- a/constraints_kokoro.txt
+++ b/constraints_kokoro.txt
@@ -0,0 +1 @@
 torch==2.1.0+corex.3.2.1
--- a/f5_server.py
+++ b/f5_server.py
@@ -0,0 +1,133 @@
 import torch
 # Choose the scaled-dot-product-attention backends before any model is
 # loaded: the flash and memory-efficient kernels are switched off and only
 # the math implementation stays enabled.
 torch.backends.cuda.enable_flash_sdp(False)
 torch.backends.cuda.enable_mem_efficient_sdp(False)
 torch.backends.cuda.enable_math_sdp(True)
 from torch import Tensor
 from typing import Optional, List
 import torch.nn.functional as F
 # def custom_conv1d_forward(self, input: Tensor, debug=False) -> Tensor:
 #     with torch.amp.autocast(input.device.type, dtype=torch.float):
 #         return self._conv_forward(input, self.weight, self.bias)
 # torch.nn.Conv1d.forward = custom_conv1d_forward
 def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Replacement ``ConvTranspose1d.forward`` evaluated under CUDA fp16 autocast.

    Keeps the checks of the stock forward (zero padding only, tuple padding),
    runs the transposed convolution inside a
    ``torch.amp.autocast('cuda', dtype=torch.float16)`` region and converts
    the result with ``.float()`` before returning it.

    Args:
        input: Tensor fed to the layer.
        output_size: Optional requested output size, forwarded to
            ``self._output_padding``.

    Returns:
        The transposed-convolution result converted with ``.float()``.

    Raises:
        ValueError: If ``self.padding_mode`` is anything other than ``'zeros'``.
    """
    if 'zeros' != self.padding_mode:
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
    assert isinstance(self.padding, tuple)

    # `_output_padding` takes a List rather than Tuple/Sequence because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    spatial_dims = 1
    extra_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size,  # type: ignore[arg-type]
        spatial_dims, self.dilation)  # type: ignore[arg-type]

    conv_args = (
        input, self.weight, self.bias, self.stride, self.padding,
        extra_padding, self.groups, self.dilation,
    )
    with torch.amp.autocast('cuda', dtype=torch.float16):
        result = F.conv_transpose1d(*conv_args)
        return result.float()


 # Install the replacement on every ConvTranspose1d layer in the process.
 torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
 from f5_tts.infer.utils_infer import (
    load_vocoder,
    load_model,
    chunk_text,
    infer_batch_process,
 )
 from omegaconf import OmegaConf
 from hydra.utils import get_class
 import torchaudio
 import io
 from fastapi import FastAPI
 from fastapi import UploadFile, File, Form
 from fastapi.responses import StreamingResponse, JSONResponse
 from contextlib import asynccontextmanager
 import uvicorn
 import os
 import logging
 # Root logging setup; the level comes from the LOGLEVEL environment variable
 # and defaults to INFO.
 logging.basicConfig(
    format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO"),
 )
 # NOTE(review): the logger is named after the file path (__file__), not the
 # module name (__name__) — confirm that is intended.
 logger = logging.getLogger(__file__)
 # Model locations, overridable through MODEL_DIR / VOCODER_DIR.
 model_dir = os.getenv('MODEL_DIR', '/models/SWivid/F5-TTS')
 vocoder_dir = os.getenv('VOCODER_DIR', '/models/charactr/vocos-mel-24khz')
 # Both are filled in by init().
 ema_model = None
 vocoder = None
 # Use CUDA when torch reports it as available, otherwise fall back to CPU.
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 def init():
    """Load the vocoder and the TTS model into the module globals.

    Fills ``vocoder`` from ``vocoder_dir`` and ``ema_model`` from the
    ``F5TTS_v1_Base`` checkpoint and vocabulary under ``model_dir``, both on
    the module-level ``device``.
    """
    global ema_model, vocoder

    mel_type = 'vocos'

    # Vocoder first, read from the local directory.
    vocoder = load_vocoder(
        vocoder_name=mel_type,
        is_local=True,
        local_path=vocoder_dir,
        device=device,
    )

    # The YAML config names the backbone class and carries its architecture.
    cfg = OmegaConf.load('/workspace/F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml')
    backbone_cls = get_class(f'f5_tts.model.{cfg.model.backbone}')
    architecture = cfg.model.arch

    checkpoint_path = os.path.join(model_dir, 'F5TTS_v1_Base/model_1250000.safetensors')
    vocab_path = os.path.join(model_dir, 'F5TTS_v1_Base/vocab.txt')
    ema_model = load_model(
        backbone_cls,
        architecture,
        checkpoint_path,
        mel_spec_type=mel_type,
        vocab_file=vocab_path,
        device=device,
    )
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: run ``init()`` before the app starts serving.

    Nothing is done after the ``yield``.
    """
    init()
    yield


app = FastAPI(lifespan=lifespan)
 def tts_generate(gen_text, ref_audio, ref_text):
    """Stream synthesized speech for ``gen_text`` as PCM16 byte chunks.

    Args:
        gen_text: Text to synthesize.
        ref_audio: Bytes of the reference audio file; decoded with
            ``torchaudio.load``.
        ref_text: Transcript of the reference audio.

    Yields:
        bytes: One chunk of samples for every piece produced by
        ``infer_batch_process`` in streaming mode.

    Raises:
        ValueError: If the decoded reference audio has no samples.
    """
    audio, sr = torchaudio.load(io.BytesIO(ref_audio))

    ref_seconds = audio.shape[-1] / sr
    if ref_seconds <= 0:
        # Without this guard the rate computation below divides by zero.
        raise ValueError("reference audio is empty")

    # Text budget per chunk: UTF-8 bytes of reference text per second of
    # reference audio, times the time left in a 22 s window, capped at 135.
    # NOTE(review): 22 looks like the per-pass audio length budget in seconds
    # — confirm against f5_tts.
    max_chars = min(int(len(ref_text.encode("utf-8")) / ref_seconds * (22 - ref_seconds)), 135)
    # Reference clips of 22 s or more would make the budget zero or negative.
    max_chars = max(max_chars, 1)

    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
    for gen_audio, _gen_sr in infer_batch_process(
        (audio, sr),
        ref_text,
        gen_text_batches,
        ema_model,
        vocoder,
        device=device,
        streaming=True,
        chunk_size=int(24e6),
        # nfe_step=16,
    ):
        # The endpoint is documented as PCM16, so floating-point waveforms are
        # clipped and scaled to int16 first; integer data passes through.
        # Assumes float samples are nominally within [-1, 1] — TODO confirm.
        if gen_audio.dtype.kind == "f":
            gen_audio = (gen_audio.clip(-1.0, 1.0) * 32767).astype("int16")
        yield gen_audio.tobytes()
 # return 24kHz pcm16
@app.post("/generate")
async def generate(
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(...),
    text: str = Form(...),
):
    """Synthesize ``text`` in the voice of the uploaded reference clip.

    Reads the uploaded ``ref_audio`` fully, then streams back the byte chunks
    yielded by ``tts_generate``; the response is labelled ``audio/wav``.
    """
    reference_bytes = await ref_audio.read()
    chunk_stream = tts_generate(text, ref_audio=reference_bytes, ref_text=ref_text)
    return StreamingResponse(chunk_stream, media_type="audio/wav")
@app.get("/ready")
@app.get("/health")
async def ready():
    """Probe endpoint for /ready and /health: always HTTP 200 with a status body."""
    payload = {"status": "ok"}
    return JSONResponse(content=payload, status_code=200)
 # Script entry point: serve the app with uvicorn on all interfaces, port 80.
 if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=80)
--- a/gsv_server.py
+++ b/gsv_server.py
@@ -0,0 +1,245 @@
 import os
 import sys
 import traceback
 import logging
 logging.basicConfig(
    format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO"),
 )
 logger = logging.getLogger(__file__)
 import torch
 from torch import Tensor
 from typing import Optional, List
 import torch.nn.functional as F
 # Choose the scaled-dot-product-attention backends: the flash and
 # memory-efficient kernels are switched off and only the math implementation
 # stays enabled.
 torch.backends.cuda.enable_flash_sdp(False)
 torch.backends.cuda.enable_mem_efficient_sdp(False)
 torch.backends.cuda.enable_math_sdp(True)
 def custom_conv1d_forward(self, input: Tensor) -> Tensor:
    """Replacement ``Conv1d.forward`` with a special path for fp16 CUDA input.

    For float16 tensors on a CUDA device the convolution runs inside
    ``torch.amp.autocast(..., dtype=torch.float)`` and the result is cast
    back with ``.half()``; every other input goes straight to
    ``self._conv_forward``.

    NOTE(review): stock PyTorch CUDA autocast is documented for
    float16/bfloat16 only — confirm a float32 target behaves as intended on
    this build.
    """
    half_on_cuda = input.device.type == 'cuda' and input.dtype == torch.float16
    if not half_on_cuda:
        return self._conv_forward(input, self.weight, self.bias)

    with torch.amp.autocast(input.device.type, dtype=torch.float):
        result = self._conv_forward(input, self.weight, self.bias)
        return result.half()


 # Install the replacement on every Conv1d layer in the process.
 torch.nn.Conv1d.forward = custom_conv1d_forward
 def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Replacement ``ConvTranspose1d.forward`` using fp16 autocast for fp32 CUDA input.

    Float32 tensors on a CUDA device are convolved inside a
    ``torch.amp.autocast('cuda', dtype=torch.float16)`` region and the result
    is converted with ``.float()``; any other input takes the plain
    ``F.conv_transpose1d`` path.

    Args:
        input: Tensor fed to the layer.
        output_size: Optional requested output size, forwarded to
            ``self._output_padding``.

    Raises:
        ValueError: If ``self.padding_mode`` is anything other than ``'zeros'``.
    """
    if 'zeros' != self.padding_mode:
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
    assert isinstance(self.padding, tuple)

    # `_output_padding` takes a List rather than Tuple/Sequence because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    spatial_dims = 1
    extra_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size,  # type: ignore[arg-type]
        spatial_dims, self.dilation)  # type: ignore[arg-type]

    conv_args = (
        input, self.weight, self.bias, self.stride, self.padding,
        extra_padding, self.groups, self.dilation,
    )
    if input.dtype == torch.float and input.device.type == 'cuda':
        with torch.amp.autocast('cuda', dtype=torch.float16):
            return F.conv_transpose1d(*conv_args).float()
    return F.conv_transpose1d(*conv_args)


 # Install the replacement on every ConvTranspose1d layer in the process.
 torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
 # Switch the working directory to the GPT-SoVITS checkout located under the
 # start-up directory, then make that checkout (first on sys.path) and its
 # GPT_SoVITS sub-directory importable.
 now_dir = os.getcwd()
 os.chdir(f'{now_dir}/GPT-SoVITS')
 now_dir = os.getcwd()
 # sys.path.append(now_dir)
 sys.path.insert(0, now_dir)
 sys.path.append("%s/GPT_SoVITS" % (now_dir))
 import sv
 # Point the sv module's checkpoint path at the copy under MODEL_DIR.
 # NOTE(review): the MODEL_DIR fallback used here ("GPT_SoVITS/pretrained_models")
 # differs from the fallback used for model_dir later in this file — confirm.
 sv.sv_path = os.path.join(os.getenv("MODEL_DIR", "GPT_SoVITS/pretrained_models"), "sv/pretrained_eres2netv2w24s4ep4.ckpt")
 import subprocess
 import signal
 import numpy as np
 import soundfile as sf
 from fastapi import FastAPI, UploadFile, File, Form
 from fastapi.responses import StreamingResponse, JSONResponse
 from contextlib import asynccontextmanager
 import uvicorn
 from io import BytesIO
 from tools.i18n.i18n import I18nAuto
 from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
 import hashlib
 from fast_langdetect import detect_language
 # Root directory of the model files, overridable through MODEL_DIR.
 model_dir = os.getenv('MODEL_DIR', '/mnt/models/GPT-SoVITS')
 # print(sys.path)
 i18n = I18nAuto()
 # Filled in by init().
 tts_pipeline = None
 def init():
    """Build the TTS pipeline and store it in the ``tts_pipeline`` global.

    All model paths in the configuration are resolved under ``model_dir``.
    """
    global tts_pipeline

    def under_models(relative_path):
        return os.path.join(model_dir, relative_path)

    custom_settings = {
        "bert_base_path": under_models("chinese-roberta-wwm-ext-large"),
        "cnhuhbert_base_path": under_models("chinese-hubert-base"),
        "device": "cuda",
        "is_half": False,
        "t2s_weights_path": under_models("s1v3.ckpt"),
        "version": "v2ProPlus",
        "vits_weights_path": under_models("v2Pro/s2Gv2ProPlus.pth"),
    }
    # The settings are handed over under the "custom" key.
    pipeline_config = TTS_Config({"custom": custom_settings})
    tts_pipeline = TTS(pipeline_config)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: run ``init()`` before the app starts serving.

    Nothing is done after the ``yield``.
    """
    init()
    yield


app = FastAPI(lifespan=lifespan)
 ### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
 def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Write ``data`` into ``io_buffer`` as single-channel OGG at ``rate`` Hz.

    Returns:
        The same ``io_buffer`` that was passed in.
    """
    ogg_writer = sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg")
    with ogg_writer:
        ogg_writer.write(data)
    return io_buffer
 def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Append the raw bytes of ``data`` to ``io_buffer`` and return the buffer.

    ``rate`` is accepted for signature parity with the other packers and is
    not used.
    """
    raw_bytes = data.tobytes()
    io_buffer.write(raw_bytes)
    return io_buffer
 def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Write ``data`` into ``io_buffer`` as a WAV file at ``rate`` Hz.

    The encoded audio goes into the buffer supplied by the caller (instead of
    a fresh buffer that silently replaced it), matching the other packers.

    Returns:
        The same ``io_buffer`` that was passed in.
    """
    sf.write(io_buffer, data, rate, format="wav")
    return io_buffer
 def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
    """Encode ``data`` to AAC (ADTS stream) with ffmpeg and append it to ``io_buffer``.

    The raw bytes of ``data`` are piped to ffmpeg, which is told to read them
    as mono signed 16-bit little-endian PCM at ``rate`` Hz.

    Returns:
        The same ``io_buffer`` that was passed in.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status; the message
            carries ffmpeg's stderr output.
    """
    command = [
        "ffmpeg",
        "-f",
        "s16le",  # input is 16-bit signed little-endian integer PCM
        "-ar",
        str(rate),  # sample rate
        "-ac",
        "1",  # mono
        "-i",
        "pipe:0",  # read the input from stdin
        "-c:a",
        "aac",  # AAC audio encoder
        "-b:a",
        "192k",  # bit rate
        "-vn",  # no video
        "-f",
        "adts",  # output as an AAC ADTS stream
        "pipe:1",  # write the output to stdout
    ]
    completed = subprocess.run(
        command,
        input=data.tobytes(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if completed.returncode != 0:
        # Surface encoder failures instead of returning empty audio.
        details = completed.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(f"ffmpeg exited with status {completed.returncode}: {details}")
    io_buffer.write(completed.stdout)
    return io_buffer
 def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str):
    """Encode ``data`` in the requested container and rewind the buffer.

    ``"ogg"``, ``"aac"`` and ``"wav"`` select the matching packer; any other
    value falls back to raw bytes.

    Returns:
        The buffer returned by the packer, positioned at offset 0.
    """
    encoders = {"ogg": pack_ogg, "aac": pack_aac, "wav": pack_wav}
    encode = encoders.get(media_type, pack_raw)
    packed = encode(io_buffer, data, rate)
    packed.seek(0)
    return packed
 def encode_audio_key(audio_bytes: bytes) -> str:
    """Return the first 16 hex characters of the MD5 digest of ``audio_bytes``."""
    digest = hashlib.md5(audio_bytes)
    hex_digest = digest.hexdigest()
    return hex_digest[:16]
 def _cache_ref_audio(ref_audio: bytes) -> str:
    """Store uploaded reference audio under /workspace/wav and return its path.

    The file name is derived from ``encode_audio_key``, so identical uploads
    reuse one file. New files are written to a temporary name and renamed
    into place, so a concurrent request never sees a half-written file.
    """
    import tempfile  # local import: only this helper needs it

    cache_dir = "/workspace/wav"
    os.makedirs(cache_dir, exist_ok=True)
    target_path = f"{cache_dir}/{encode_audio_key(ref_audio)}.wav"
    if os.path.exists(target_path):
        return target_path

    fd, tmp_path = tempfile.mkstemp(dir=cache_dir, suffix=".tmp")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(ref_audio)
        os.replace(tmp_path, target_path)
    except BaseException:
        # Do not leave partial temporary files behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return target_path


 def tts_generate(gen_text, text_lang="zh", ref_audio=None, ref_text=None):
    """Stream synthesized speech for ``gen_text`` as raw byte chunks.

    Args:
        gen_text: Text to synthesize.
        text_lang: Language code of ``gen_text``.
        ref_audio: Either a path (``str``) to the reference audio or the
            audio file content as ``bytes``; bytes are cached on disk first.
        ref_text: Transcript of the reference audio. When given, its language
            is detected with ``detect_language``; otherwise ``text_lang`` is
            used as the prompt language.

    Yields:
        bytes: The raw bytes of each chunk produced by the TTS pipeline.

    Raises:
        ValueError: If ``ref_audio`` is not provided.
    """
    if ref_audio is None:
        raise ValueError("ref_audio is required")

    ref_audio_path = ref_audio if isinstance(ref_audio, str) else _cache_ref_audio(ref_audio)
    ref_lang = detect_language(ref_text).lower() if ref_text else text_lang

    req = {
        "text": gen_text,
        "text_lang": text_lang,
        "ref_audio_path": ref_audio_path,
        "prompt_text": ref_text,
        "prompt_lang": ref_lang,
        "text_split_method": "cut2",
        # Passed to the pipeline unchanged; the chunks below are packed raw.
        "media_type": "wav",
        "speed_factor": 1.0,
        "parallel_infer": False,
        "batch_size": 1,
        "split_bucket": False,
        "streaming_mode": True,
        # Streaming always goes together with fragment-wise results.
        "return_fragment": True,
    }
    for sr, chunk in tts_pipeline.run(req):
        # Any media type other than ogg/aac/wav selects the raw packer.
        yield pack_audio(BytesIO(), chunk, sr, media_type="raw").getvalue()
 # return 32kHz pcm16
@app.post("/generate")
async def generate(
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(...),
    text: str = Form(...),
    lang: str = Form("zh"),
):
    """Synthesize ``text`` (language ``lang``) in the voice of the uploaded clip.

    Reads the uploaded ``ref_audio`` fully, then streams back the byte chunks
    yielded by ``tts_generate``; the response is labelled ``audio/wav``.
    """
    reference_bytes = await ref_audio.read()
    chunk_stream = tts_generate(
        text,
        text_lang=lang,
        ref_audio=reference_bytes,
        ref_text=ref_text,
    )
    return StreamingResponse(chunk_stream, media_type="audio/wav")
@app.get("/ready")
@app.get("/health")
async def ready():
    """Probe endpoint for /ready and /health: always HTTP 200 with a status body."""
    payload = {"status": "ok"}
    return JSONResponse(content=payload, status_code=200)
 # Script entry point: serve the app with a single uvicorn worker on all
 # interfaces, port 80.
 if __name__ == "__main__":
    try:
        uvicorn.run(app=app, host="0.0.0.0", port=80, workers=1)
    except Exception:
        # Print the failure, then send SIGTERM to this very process.
        traceback.print_exc()
        os.kill(os.getpid(), signal.SIGTERM)
        # NOTE(review): a failed start-up ends with exit status 0 here —
        # confirm that is intended.
        exit(0)
--- a/kokoro_server.py
+++ b/kokoro_server.py
@@ -0,0 +1,132 @@
 import os
 from fastapi import FastAPI, Body
 from fastapi.responses import StreamingResponse, JSONResponse
 from contextlib import asynccontextmanager
 import uvicorn
 import xml.etree.ElementTree as ET
 from kokoro import KPipeline, KModel
 import numpy as np
 # from scipy.signal import resample
 import torch
 from torch import Tensor
 from torch.nn import functional as F
 from typing import Optional, List
 def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
    """Replacement ``ConvTranspose1d.forward`` evaluated under CUDA fp16 autocast.

    Keeps the checks of the stock forward (zero padding only, tuple padding),
    runs the transposed convolution inside a
    ``torch.amp.autocast('cuda', dtype=torch.float16)`` region and converts
    the result with ``.float()`` before returning it.

    Args:
        input: Tensor fed to the layer.
        output_size: Optional requested output size, forwarded to
            ``self._output_padding``.

    Returns:
        The transposed-convolution result converted with ``.float()``.

    Raises:
        ValueError: If ``self.padding_mode`` is anything other than ``'zeros'``.
    """
    if 'zeros' != self.padding_mode:
        raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
    assert isinstance(self.padding, tuple)

    # `_output_padding` takes a List rather than Tuple/Sequence because
    # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
    spatial_dims = 1
    extra_padding = self._output_padding(
        input, output_size, self.stride, self.padding, self.kernel_size,  # type: ignore[arg-type]
        spatial_dims, self.dilation)  # type: ignore[arg-type]

    conv_args = (
        input, self.weight, self.bias, self.stride, self.padding,
        extra_padding, self.groups, self.dilation,
    )
    with torch.amp.autocast('cuda', dtype=torch.float16):
        result = F.conv_transpose1d(*conv_args)
        return result.float()


 # Install the replacement on every ConvTranspose1d layer in the process.
 torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
 # Repository id handed to KModel and to every KPipeline in init().
 repo_id = 'hexgrad/Kokoro-82M-v1.1-zh'
 # MODEL_SR = 24000
 # Model, pipelines and voice file paths; all filled in by init().
 model = None
 en_empty_pipeline = None
 en_pipeline = None
 zh_pipeline = None
 en_voice_pt = None
 zh_voice_pt = None
 # Voice file names (EN_VOICE / ZH_VOICE) and the model directory (MODEL_DIR)
 # can be overridden through the environment.
 en_voice = os.getenv('EN_VOICE', 'af_maple.pt')
 zh_voice = os.getenv('ZH_VOICE', 'zf_046.pt')
 model_dir = os.getenv('MODEL_DIR', '/models/hexgrad/Kokoro-82M-v1.1-zh')
 def en_callable(text):
    """Return phonemes for an English fragment.

    ``'Kokoro'`` and ``'Sol'`` have fixed pronunciations; anything else is
    taken from the first result of ``en_empty_pipeline``.
    """
    fixed_pronunciations = {
        'Kokoro': 'kˈOkəɹO',
        'Sol': 'sˈOl',
    }
    if text in fixed_pronunciations:
        return fixed_pronunciations[text]
    first_result = next(en_empty_pipeline(text))
    return first_result.phonemes
 # HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens
 # Simple piecewise linear fn that decreases speed as len_ps increases
 def speed_callable(len_ps):
    """Map a phoneme count to a speech-speed factor.

    Up to 83 the speed is 1; between 83 and 183 it falls linearly by 1/500
    per unit; from 183 on it stays at 0.8.
    """
    if len_ps <= 83:
        return 1
    if len_ps < 183:
        return 1 - (len_ps - 83) / 500
    return 0.8
 # def resample_audio(data: np.ndarray, original_rate: int, target_rate: int):
 #     ori_dtype = data.dtype
 #     # data = normalize_audio(data)
 #     number_of_samples = int(len(data) * float(target_rate) / original_rate)
 #     resampled_data = resample(data, number_of_samples)
 #     # resampled_data = normalize_audio(resampled_data)
 #     return resampled_data.astype(ori_dtype)
 def audio_postprocess(audio: np.ndarray):
    """Convert float32 audio to int16 PCM; other dtypes are returned unchanged.

    Float samples are clipped to [-1, 1] before being scaled by 32767, so
    values outside that range saturate instead of overflowing in the int16
    conversion.

    Args:
        audio: Array of audio samples.

    Returns:
        An int16 array when ``audio`` is float32, otherwise ``audio`` itself.
    """
    if audio.dtype == np.float32:
        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    return audio
 def init():
    """Load the Kokoro model, build the pipelines and resolve the voice files.

    Fills the module globals ``model``, ``en_empty_pipeline``,
    ``en_pipeline``, ``zh_pipeline``, ``en_voice_pt`` and ``zh_voice_pt``.
    """
    global model, en_empty_pipeline, en_pipeline, zh_pipeline
    global en_voice_pt, zh_voice_pt

    if torch.cuda.is_available():
        target_device = 'cuda'
    else:
        target_device = 'cpu'

    weights_file = os.path.join(model_dir, 'kokoro-v1_1-zh.pth')
    config_file = os.path.join(model_dir, 'config.json')
    network = KModel(repo_id=repo_id, model=weights_file, config=config_file)
    model = network.to(target_device).eval()

    # The model-less English pipeline is the one en_callable queries for phonemes.
    en_empty_pipeline = KPipeline(lang_code='a', repo_id=repo_id, model=False)
    en_pipeline = KPipeline(lang_code='a', repo_id=repo_id, model=model)
    zh_pipeline = KPipeline(lang_code='z', repo_id=repo_id, model=model, en_callable=en_callable)

    voices_dir = os.path.join(model_dir, 'voices')
    en_voice_pt = os.path.join(voices_dir, en_voice)
    zh_voice_pt = os.path.join(voices_dir, zh_voice)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: run ``init()`` before the app starts serving.

    Nothing is done after the ``yield``.
    """
    init()
    yield


app = FastAPI(lifespan=lifespan)
 xml_namespace = "{http://www.w3.org/XML/1998/namespace}"
 # return 24kHz pcm-16
@app.post("/tts")
def generate(ssml: str = Body(...)):
    """Synthesize the text of the first ``<voice>`` element of an SSML document.

    The language is read from the element's ``xml:lang`` attribute (default
    ``"zh"``); English tags such as ``en`` or ``en-US`` use the English
    pipeline, everything else the Chinese one.

    Args:
        ssml: SSML document as a string.

    Returns:
        A streaming response of int16 PCM byte chunks, or a 400 JSON response
        when the SSML cannot be parsed, has no ``<voice>`` element, or the
        element holds no text.
    """
    # NOTE(review): xml.etree.ElementTree is not hardened against maliciously
    # constructed XML; consider a hardened parser if requests are untrusted.
    try:
        root = ET.fromstring(ssml)
    except ET.ParseError as e:
        return JSONResponse(status_code=400, content={"message": "Invalid SSML format", "Exception": str(e)})

    # "{*}" matches <voice> whether or not the document declares a namespace.
    voice_element = root.find(".//{*}voice")
    if voice_element is None:
        return JSONResponse(status_code=400, content={"message": "Invalid SSML format: <voice> element not found."})

    # itertext() also collects text nested in child elements and never yields
    # None, unlike the bare .text attribute.
    text = "".join(voice_element.itertext()).strip()
    if not text:
        return JSONResponse(status_code=400, content={"message": "Invalid SSML format: <voice> element contains no text."})

    language = voice_element.get(f'{xml_namespace}lang', "zh").strip()
    # Compare on the primary subtag so that "en-US" / "en_GB" count as English.
    is_english = language.lower().replace('_', '-').split('-')[0] == 'en'

    def streaming_generator():
        if is_english:
            generator = en_pipeline(text=text, voice=en_voice_pt)
        else:
            generator = zh_pipeline(text=text, voice=zh_voice_pt, speed=speed_callable)
        for (_, _, audio) in generator:
            yield audio_postprocess(audio.numpy()).tobytes()

    return StreamingResponse(streaming_generator(), media_type='audio/wav')
@app.get("/health")
@app.get("/ready")
async def ready():
    """Probe endpoint for /health and /ready: always HTTP 200 with a status body."""
    payload = {"status": "ok"}
    return JSONResponse(content=payload, status_code=200)
 # Script entry point: serve the app with uvicorn on all interfaces, port 80.
 if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=80)
--- a/launch_f5.sh
+++ b/launch_f5.sh
@@ -0,0 +1,3 @@
 #!/bin/bash
 # Start the F5-TTS server. `exec` replaces this shell with the Python
 # process, so signals sent to the container's entrypoint (e.g. SIGTERM on
 # stop) reach the server directly instead of stopping at bash.
 exec python3 f5_server.py
--- a/launch_gsv.sh
+++ b/launch_gsv.sh
@@ -0,0 +1,15 @@
 #!/bin/bash
 # Start the GPT-SoVITS server, filling in defaults for any of MODEL_DIR,
 # NLTK_DATA and bert_path that are unset or empty.
 export MODEL_DIR="${MODEL_DIR:-/models/GPT-SoVITS}"
 # Derived from MODEL_DIR (like bert_path) so that overriding MODEL_DIR alone
 # moves every default location with it.
 export NLTK_DATA="${NLTK_DATA:-${MODEL_DIR}/nltk_data}"
 export bert_path="${bert_path:-${MODEL_DIR}/chinese-roberta-wwm-ext-large}"
 # `exec` replaces this shell with the Python process, so signals sent to the
 # container's entrypoint reach the server directly.
 exec python3 gsv_server.py
--- a/launch_kokoro.sh
+++ b/launch_kokoro.sh
@@ -0,0 +1,4 @@
 #!/bin/bash
 # Start the Kokoro server. `exec` replaces this shell with the Python
 # process, so signals sent to the container's entrypoint (e.g. SIGTERM on
 # stop) reach the server directly instead of stopping at bash.
 exec python3 kokoro_server.py
--- a/requirements_f5.txt
+++ b/requirements_f5.txt
@@ -0,0 +1,3 @@
 f5-tts
 fastapi
 uvicorn[standard]
--- a/requirements_kokoro.txt
+++ b/requirements_kokoro.txt
@@ -0,0 +1,5 @@
 kokoro>=0.8.2
 misaki[zh]>=0.8.2
 soundfile
 fastapi
 uvicorn[standard]