From 05a77ed283877fa3ed12e713466157cc69fc0750 Mon Sep 17 00:00:00 2001 From: zhousha <736730048@qq.com> Date: Tue, 12 Aug 2025 14:15:41 +0800 Subject: [PATCH] merge code repo for f5 and gpt and kokoro --- .DS_Store | Bin 0 -> 6148 bytes Dockerfile_f5 | 7 ++ Dockerfile_gsv | 10 ++ Dockerfile_kokoro | 9 ++ README.md | 5 + constraints_f5.txt | 1 + constraints_gsv.txt | 1 + constraints_kokoro.txt | 1 + f5_server.py | 133 ++++++++++++++++++++++ gsv_server.py | 245 ++++++++++++++++++++++++++++++++++++++++ kokoro_server.py | 132 ++++++++++++++++++++++ launch_f5.sh | 3 + launch_gsv.sh | 15 +++ launch_kokoro.sh | 4 + requirements_f5.txt | 3 + requirements_kokoro.txt | 5 + 16 files changed, 574 insertions(+) create mode 100644 .DS_Store create mode 100644 Dockerfile_f5 create mode 100644 Dockerfile_gsv create mode 100644 Dockerfile_kokoro create mode 100644 README.md create mode 100644 constraints_f5.txt create mode 100644 constraints_gsv.txt create mode 100644 constraints_kokoro.txt create mode 100644 f5_server.py create mode 100644 gsv_server.py create mode 100644 kokoro_server.py create mode 100755 launch_f5.sh create mode 100755 launch_gsv.sh create mode 100755 launch_kokoro.sh create mode 100644 requirements_f5.txt create mode 100644 requirements_kokoro.txt diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..39566dad8ade06b196960b793c1d556fbebf11bf GIT binary patch literal 6148 zcmeHKJxc>Y5PhR5f;K5F_YYY350;Q1c0z)+Xv)A9VtaNP&N)fGsw=&6=-Ny>;?(-fJ8EmF_j)bT_Vp!VvA4810xF fZ^w60ly%M5Jnw~LV$hinI#E9Zu8T|x{Ivons7w?R literal 0 HcmV?d00001 diff --git a/Dockerfile_f5 b/Dockerfile_f5 new file mode 100644 index 0000000..b566f9f --- /dev/null +++ b/Dockerfile_f5 @@ -0,0 +1,7 @@ +FROM corex:3.2.1 + +WORKDIR /workspace +COPY requirements_f5.txt constraints_f5.txt f5_server.py launch_f5.sh /workspace/ +RUN pip install -r requirements_f5.txt -c constraints_f5.txt + +ENTRYPOINT ["/bin/bash", "launch_f5.sh"] diff --git a/Dockerfile_gsv 
b/Dockerfile_gsv new file mode 100644 index 0000000..527d8fe --- /dev/null +++ b/Dockerfile_gsv @@ -0,0 +1,10 @@ +FROM corex:3.2.1 + +WORKDIR /workspace +COPY GPT-SoVITS constraints_gsv.txt gsv_server.py launch_gsv.sh /workspace/ +RUN pip install -r GPT-SoVITS/extra-req.txt --no-deps \ + && pip install -r GPT-SoVITS/requirements.txt -c constraints_gsv.txt \ + && apt update \ + && apt install -y ffmpeg libsox-dev + +ENTRYPOINT ["/bin/bash", "launch_gsv.sh"] diff --git a/Dockerfile_kokoro b/Dockerfile_kokoro new file mode 100644 index 0000000..a120899 --- /dev/null +++ b/Dockerfile_kokoro @@ -0,0 +1,9 @@ +FROM corex:3.2.1 + +WORKDIR /workspace +COPY requirements_kokoro.txt constraints_kokoro.txt kokoro_server.py launch_kokoro.sh /workspace/ +RUN pip install -r requirements_kokoro.txt -c constraints_kokoro.txt \ + && apt update \ + && apt install -y espeak-ng + +ENTRYPOINT ["/bin/bash", "launch_kokoro.sh"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..1a24abf --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# tiangai100-f5-tts +# tiangai100-kokoro-tts +# tiangai100-gpt-sovits + +【语音合成】 \ No newline at end of file diff --git a/constraints_f5.txt b/constraints_f5.txt new file mode 100644 index 0000000..65d66ac --- /dev/null +++ b/constraints_f5.txt @@ -0,0 +1 @@ +torch==2.1.0+corex.3.2.1 \ No newline at end of file diff --git a/constraints_gsv.txt b/constraints_gsv.txt new file mode 100644 index 0000000..65d66ac --- /dev/null +++ b/constraints_gsv.txt @@ -0,0 +1 @@ +torch==2.1.0+corex.3.2.1 \ No newline at end of file diff --git a/constraints_kokoro.txt b/constraints_kokoro.txt new file mode 100644 index 0000000..65d66ac --- /dev/null +++ b/constraints_kokoro.txt @@ -0,0 +1 @@ +torch==2.1.0+corex.3.2.1 \ No newline at end of file diff --git a/f5_server.py b/f5_server.py new file mode 100644 index 0000000..906f990 --- /dev/null +++ b/f5_server.py @@ -0,0 +1,133 @@ +import torch + +torch.backends.cuda.enable_flash_sdp(False) 
+torch.backends.cuda.enable_mem_efficient_sdp(False) +torch.backends.cuda.enable_math_sdp(True) + +from torch import Tensor +from typing import Optional, List +import torch.nn.functional as F + +# def custom_conv1d_forward(self, input: Tensor, debug=False) -> Tensor: +# with torch.amp.autocast(input.device.type, dtype=torch.float): +# return self._conv_forward(input, self.weight, self.bias) + +# torch.nn.Conv1d.forward = custom_conv1d_forward + +def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor: + if self.padding_mode != 'zeros': + raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d') + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. + num_spatial_dims = 1 + output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, # type: ignore[arg-type] + num_spatial_dims, self.dilation) # type: ignore[arg-type] + with torch.amp.autocast('cuda', dtype=torch.float16): + return F.conv_transpose1d( + input, self.weight, self.bias, self.stride, self.padding, + output_padding, self.groups, self.dilation).float() + +torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward + +from f5_tts.infer.utils_infer import ( + load_vocoder, + load_model, + chunk_text, + infer_batch_process, +) +from omegaconf import OmegaConf +from hydra.utils import get_class +import torchaudio +import io + +from fastapi import FastAPI +from fastapi import UploadFile, File, Form +from fastapi.responses import StreamingResponse, JSONResponse +from contextlib import asynccontextmanager +import uvicorn +import os + +import logging +logging.basicConfig( + format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO"), +) +logger = logging.getLogger(__file__) + + +model_dir = 
os.getenv('MODEL_DIR', '/models/SWivid/F5-TTS') +vocoder_dir = os.getenv('VOCODER_DIR', '/models/charactr/vocos-mel-24khz') + +ema_model = None +vocoder = None +device = 'cuda' if torch.cuda.is_available() else 'cpu' + +def init(): + global ema_model, vocoder + # load vocoder + vocoder_name = 'vocos' + vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=True, local_path=vocoder_dir, device=device) + + # load TTS model + model_cfg = OmegaConf.load('/workspace/F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml') + model_cls = get_class(f'f5_tts.model.{model_cfg.model.backbone}') + model_arc = model_cfg.model.arch + ckpt_file = os.path.join(model_dir, 'F5TTS_v1_Base/model_1250000.safetensors') + vocab_file = os.path.join(model_dir, 'F5TTS_v1_Base/vocab.txt') + ema_model = load_model( + model_cls, model_arc, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file, device=device + ) + +@asynccontextmanager +async def lifespan(app: FastAPI): + init() + yield + pass + +app = FastAPI(lifespan=lifespan) + +def tts_generate(gen_text, ref_audio, ref_text): + global ema_model, vocoder + + audio, sr = torchaudio.load(io.BytesIO(ref_audio)) + max_chars = min(int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr)), 135) + gen_text_batches = chunk_text(gen_text, max_chars=max_chars) + for gen_audio, gen_sr in infer_batch_process( + (audio, sr), + ref_text, + gen_text_batches, + ema_model, + vocoder, + device=device, + streaming=True, + chunk_size=int(24e6), + # nfe_step=16, + ): + yield gen_audio.tobytes() + +# return 24kHz pcm16 +@app.post("/generate") +async def generate( + ref_audio: UploadFile = File(...), + ref_text: str = Form(...), + text: str = Form(...) 
+): + audio_bytes = await ref_audio.read() + return StreamingResponse( + tts_generate(text, ref_audio=audio_bytes, ref_text=ref_text), + media_type="audio/wav" + ) + + +@app.get("/ready") +@app.get("/health") +async def ready(): + return JSONResponse(status_code=200, content={"status": "ok"}) + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=80) diff --git a/gsv_server.py b/gsv_server.py new file mode 100644 index 0000000..57ab574 --- /dev/null +++ b/gsv_server.py @@ -0,0 +1,245 @@ +import os +import sys +import traceback + +import logging +logging.basicConfig( + format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO"), +) +logger = logging.getLogger(__file__) + + +import torch +from torch import Tensor +from typing import Optional, List +import torch.nn.functional as F + +torch.backends.cuda.enable_flash_sdp(False) +torch.backends.cuda.enable_mem_efficient_sdp(False) +torch.backends.cuda.enable_math_sdp(True) + +def custom_conv1d_forward(self, input: Tensor) -> Tensor: + if input.dtype == torch.float16 and input.device.type == 'cuda': + with torch.amp.autocast(input.device.type, dtype=torch.float): + return self._conv_forward(input, self.weight, self.bias).half() + else: + return self._conv_forward(input, self.weight, self.bias) + +torch.nn.Conv1d.forward = custom_conv1d_forward + +def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor: + if self.padding_mode != 'zeros': + raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d') + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
+ num_spatial_dims = 1 + output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, # type: ignore[arg-type] + num_spatial_dims, self.dilation) # type: ignore[arg-type] + if input.dtype == torch.float and input.device.type == 'cuda': + with torch.amp.autocast('cuda', dtype=torch.float16): + return F.conv_transpose1d( + input, self.weight, self.bias, self.stride, self.padding, + output_padding, self.groups, self.dilation).float() + else: + return F.conv_transpose1d( + input, self.weight, self.bias, self.stride, self.padding, + output_padding, self.groups, self.dilation) + +torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward + + +now_dir = os.getcwd() +os.chdir(f'{now_dir}/GPT-SoVITS') +now_dir = os.getcwd() +# sys.path.append(now_dir) +sys.path.insert(0, now_dir) +sys.path.append("%s/GPT_SoVITS" % (now_dir)) + +import sv +sv.sv_path = os.path.join(os.getenv("MODEL_DIR", "GPT_SoVITS/pretrained_models"), "sv/pretrained_eres2netv2w24s4ep4.ckpt") + +import subprocess +import signal +import numpy as np +import soundfile as sf +from fastapi import FastAPI, UploadFile, File, Form +from fastapi.responses import StreamingResponse, JSONResponse +from contextlib import asynccontextmanager +import uvicorn +from io import BytesIO +from tools.i18n.i18n import I18nAuto +from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config +import hashlib +from fast_langdetect import detect_language + +model_dir = os.getenv('MODEL_DIR', '/mnt/models/GPT-SoVITS') + +# print(sys.path) +i18n = I18nAuto() +tts_pipeline = None + +def init(): + global tts_pipeline + + gsv_config = { + # "version": "v2ProPlus", + "custom": { + "bert_base_path": os.path.join(model_dir, "chinese-roberta-wwm-ext-large"), + "cnhuhbert_base_path": os.path.join(model_dir, "chinese-hubert-base"), + "device": "cuda", + "is_half": False, + "t2s_weights_path": os.path.join(model_dir, "s1v3.ckpt"), + "version": "v2ProPlus", + "vits_weights_path": os.path.join(model_dir, 
"v2Pro/s2Gv2ProPlus.pth") + } + } + tts_config = TTS_Config(gsv_config) + # tts_config = TTS_Config(config_path) + tts_pipeline = TTS(tts_config) + +@asynccontextmanager +async def lifespan(app: FastAPI): + init() + yield + pass + +app = FastAPI(lifespan=lifespan) + +### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files +def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int): + with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file: + audio_file.write(data) + return io_buffer + + +def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int): + io_buffer.write(data.tobytes()) + return io_buffer + + +def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int): + io_buffer = BytesIO() + sf.write(io_buffer, data, rate, format="wav") + return io_buffer + + +def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int): + process = subprocess.Popen( + [ + "ffmpeg", + "-f", + "s16le", # 输入16位有符号小端整数PCM + "-ar", + str(rate), # 设置采样率 + "-ac", + "1", # 单声道 + "-i", + "pipe:0", # 从管道读取输入 + "-c:a", + "aac", # 音频编码器为AAC + "-b:a", + "192k", # 比特率 + "-vn", # 不包含视频 + "-f", + "adts", # 输出AAC数据流格式 + "pipe:1", # 将输出写入管道 + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, _ = process.communicate(input=data.tobytes()) + io_buffer.write(out) + return io_buffer + + +def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str): + if media_type == "ogg": + io_buffer = pack_ogg(io_buffer, data, rate) + elif media_type == "aac": + io_buffer = pack_aac(io_buffer, data, rate) + elif media_type == "wav": + io_buffer = pack_wav(io_buffer, data, rate) + else: + io_buffer = pack_raw(io_buffer, data, rate) + io_buffer.seek(0) + return io_buffer + + +def encode_audio_key(audio_bytes: bytes) -> str: + return hashlib.md5(audio_bytes).hexdigest()[:16] + +def tts_generate(gen_text, text_lang="zh", ref_audio=None, ref_text=None): + if isinstance(ref_audio, str): + ref_audio_path 
= ref_audio + else: + audio_key = encode_audio_key(ref_audio) + os.makedirs("/workspace/wav", exist_ok=True) + if not os.path.exists(f"/workspace/wav/{audio_key}.wav"): + with open(f"/workspace/wav/{audio_key}.wav", "wb") as f: + f.write(ref_audio) + ref_audio_path = f"/workspace/wav/{audio_key}.wav" + ref_lang = detect_language(ref_text).lower() if ref_text else text_lang + + req = { + "text": gen_text, + "text_lang": text_lang, + "ref_audio_path": ref_audio_path, + "prompt_text": ref_text, + "prompt_lang": ref_lang, + "text_split_method": "cut2", + "media_type": "wav", + "speed_factor": 1.0, + "parallel_infer": False, + "batch_size": 1, + "split_bucket": False, + "streaming_mode": True + } + + streaming_mode = req.get("streaming_mode", False) + return_fragment = req.get("return_fragment", False) + media_type = req.get("media_type", "wav") + + # check_res = check_params(req) + # if check_res is not None: + # return check_res + + if streaming_mode or return_fragment: + req["return_fragment"] = True + + tts_generator = tts_pipeline.run(req) + for sr, chunk in tts_generator: + yield pack_audio(BytesIO(), chunk, sr, media_type=None).getvalue() + +# return 32kHz pcm16 +@app.post("/generate") +async def generate( + ref_audio: UploadFile = File(...), + ref_text: str = Form(...), + text: str = Form(...), + lang: str = Form("zh") +): + audio_bytes = await ref_audio.read() + return StreamingResponse( + tts_generate(text, text_lang=lang, ref_audio=audio_bytes, ref_text=ref_text), + media_type="audio/wav" + ) + +@app.get("/ready") +@app.get("/health") +async def ready(): + return JSONResponse(status_code=200, content={"status": "ok"}) + + +if __name__ == "__main__": + try: + uvicorn.run(app=app, host="0.0.0.0", port=80, workers=1) + except Exception: + traceback.print_exc() + os.kill(os.getpid(), signal.SIGTERM) + exit(0) diff --git a/kokoro_server.py b/kokoro_server.py new file mode 100644 index 0000000..8602998 --- /dev/null +++ b/kokoro_server.py @@ -0,0 +1,132 @@ +import 
os + +from fastapi import FastAPI, Body +from fastapi.responses import StreamingResponse, JSONResponse +from contextlib import asynccontextmanager +import uvicorn +import xml.etree.ElementTree as ET + +from kokoro import KPipeline, KModel +import numpy as np +# from scipy.signal import resample + +import torch +from torch import Tensor +from torch.nn import functional as F +from typing import Optional, List + +def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor: + if self.padding_mode != 'zeros': + raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d') + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. + num_spatial_dims = 1 + output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, # type: ignore[arg-type] + num_spatial_dims, self.dilation) # type: ignore[arg-type] + with torch.amp.autocast('cuda', dtype=torch.float16): + return F.conv_transpose1d( + input, self.weight, self.bias, self.stride, self.padding, + output_padding, self.groups, self.dilation).float() + +torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward + + +repo_id = 'hexgrad/Kokoro-82M-v1.1-zh' +# MODEL_SR = 24000 +model = None +en_empty_pipeline = None +en_pipeline = None +zh_pipeline = None +en_voice_pt = None +zh_voice_pt = None +en_voice = os.getenv('EN_VOICE', 'af_maple.pt') +zh_voice = os.getenv('ZH_VOICE', 'zf_046.pt') +model_dir = os.getenv('MODEL_DIR', '/models/hexgrad/Kokoro-82M-v1.1-zh') + +def en_callable(text): + if text == 'Kokoro': + return 'kˈOkəɹO' + elif text == 'Sol': + return 'sˈOl' + return next(en_empty_pipeline(text)).phonemes + +# HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens +# Simple piecewise linear fn that decreases speed as len_ps increases +def speed_callable(len_ps): + speed = 0.8 
+ if len_ps <= 83: + speed = 1 + elif len_ps < 183: + speed = 1 - (len_ps - 83) / 500 + return speed + +# def resample_audio(data: np.ndarray, original_rate: int, target_rate: int): +# ori_dtype = data.dtype +# # data = normalize_audio(data) +# number_of_samples = int(len(data) * float(target_rate) / original_rate) +# resampled_data = resample(data, number_of_samples) +# # resampled_data = normalize_audio(resampled_data) +# return resampled_data.astype(ori_dtype) + +def audio_postprocess(audio: np.ndarray): + if audio.dtype == np.float32: + audio = np.int16(audio * 32767) + return audio + +def init(): + global model, en_empty_pipeline, en_pipeline, zh_pipeline + global en_voice_pt, zh_voice_pt + device = 'cuda' if torch.cuda.is_available() else 'cpu' + model = KModel(repo_id=repo_id, model=os.path.join(model_dir, 'kokoro-v1_1-zh.pth'), config=os.path.join(model_dir, 'config.json')).to(device).eval() + en_empty_pipeline = KPipeline(lang_code='a', repo_id=repo_id, model=False) + en_pipeline = KPipeline(lang_code='a', repo_id=repo_id, model=model) + zh_pipeline = KPipeline(lang_code='z', repo_id=repo_id, model=model, en_callable=en_callable) + en_voice_pt = os.path.join(model_dir, 'voices', en_voice) + zh_voice_pt = os.path.join(model_dir, 'voices', zh_voice) + +@asynccontextmanager +async def lifespan(app: FastAPI): + init() + yield + pass + +app = FastAPI(lifespan=lifespan) + +xml_namespace = "{http://www.w3.org/XML/1998/namespace}" + +# return 24kHz pcm-16 +@app.post("/tts") +def generate(ssml: str = Body(...)): + try: + root = ET.fromstring(ssml) + voice_element = root.find(".//voice") + if voice_element is not None: + text = voice_element.text.strip() + language = voice_element.get(f'{xml_namespace}lang', "zh").strip() + else: + return JSONResponse(status_code=400, content={"message": "Invalid SSML format: element not found."}) + except ET.ParseError as e: + return JSONResponse(status_code=400, content={"message": "Invalid SSML format", "Exception": str(e)}) + + 
def streaming_generator(): + if language == 'en': + generator = en_pipeline(text=text, voice=en_voice_pt) + else: + generator = zh_pipeline(text=text, voice=zh_voice_pt, speed=speed_callable) + for (_, _, audio) in generator: + yield audio_postprocess(audio.numpy()).tobytes() + + return StreamingResponse(streaming_generator(), media_type='audio/wav') + + +@app.get("/health") +@app.get("/ready") +async def ready(): + return JSONResponse(status_code=200, content={"status": "ok"}) + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=80) diff --git a/launch_f5.sh b/launch_f5.sh new file mode 100755 index 0000000..c4561a8 --- /dev/null +++ b/launch_f5.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python3 f5_server.py diff --git a/launch_gsv.sh b/launch_gsv.sh new file mode 100755 index 0000000..6340aad --- /dev/null +++ b/launch_gsv.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +if [ -z "$MODEL_DIR" ]; then + export MODEL_DIR="/models/GPT-SoVITS" +fi + +if [ -z "$NLTK_DATA" ]; then + export NLTK_DATA="/models/GPT-SoVITS/nltk_data" +fi + +if [ -z "$bert_path" ]; then + export bert_path="${MODEL_DIR}/chinese-roberta-wwm-ext-large" +fi + +python3 gsv_server.py diff --git a/launch_kokoro.sh b/launch_kokoro.sh new file mode 100755 index 0000000..7dfbcd9 --- /dev/null +++ b/launch_kokoro.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +python3 kokoro_server.py + diff --git a/requirements_f5.txt b/requirements_f5.txt new file mode 100644 index 0000000..1fb3a78 --- /dev/null +++ b/requirements_f5.txt @@ -0,0 +1,3 @@ +f5-tts +fastapi +uvicorn[standard] \ No newline at end of file diff --git a/requirements_kokoro.txt b/requirements_kokoro.txt new file mode 100644 index 0000000..69bc8c2 --- /dev/null +++ b/requirements_kokoro.txt @@ -0,0 +1,5 @@ +kokoro>=0.8.2 +misaki[zh]>=0.8.2 +soundfile +fastapi +uvicorn[standard] \ No newline at end of file