update README

2025-09-10 10:47:02 +08:00
parent 5088f0b50a
commit ff78032400
603 changed files with 21 additions and 23 deletions
--- a/mlu_370-f5-tts/f5_server.py
+++ b/mlu_370-f5-tts/f5_server.py
@@ -0,0 +1,316 @@
+import os
+# Checkpoint location; both values can be overridden through the environment.
+model_dir = os.getenv("MODEL_DIR", "/mounted_model")
+model_name = os.getenv("MODEL_NAME", "model.safetensors")
+
+import logging
+# Configure logging once at import time; the LOGLEVEL environment variable
+# selects the threshold (INFO when unset).
+logging.basicConfig(
+    format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO"),
+)
+# NOTE(review): the logger is named after the file path (__file__), not __name__.
+logger = logging.getLogger(__file__)
+
+
+import torch
+# Fix the number of threads torch may use to 4.
+torch.set_num_threads(4)
+
+from torch import Tensor
+from typing import Optional, List
+import torch.nn.functional as F
+
+
+from f5_tts.infer.utils_infer import (
+    load_vocoder,
+    load_model,
+    preprocess_ref_audio_text,
+    infer_process,
+    infer_batch_process,
+)
+from omegaconf import OmegaConf
+from hydra.utils import get_class
+import torch
+import re
+import numpy as np
+import soundfile as sf
+import torchaudio
+from scipy import signal
+import io
+import time
+
+from fastapi import FastAPI, Request, Response, Body, HTTPException
+from fastapi import UploadFile, File, Form
+from fastapi.responses import StreamingResponse, JSONResponse
+from contextlib import asynccontextmanager
+import uvicorn
+import os
+import hashlib
+import xml.etree.ElementTree as ET
+from typing import Union
+
+# Local directory of the vocoder weights (passed to load_vocoder in init()).
+vocoder_dir = os.getenv('VOCODER_DIR', '/app/charactr/vocos-mel-24khz')
+# Speed value forwarded to infer_batch_process; overridable via SPEED.
+speed = float(os.getenv('SPEED', 1.0))
+
+# Populated by init() at application start-up.
+ema_model = None
+vocoder = None
+# Registered voices: audio_key -> {'ref_audio': bytes, 'ref_text': str}.
+voice_dict = {}
+# device = 'cuda' if torch.cuda.is_available() else 'cpu'
+device = 'mlu'
+# Sample rate the generated audio is resampled to before streaming.
+TARGET_SR = 16000
+# Number of int16 zero samples returned when the input text has no words.
+N_ZEROS = 20
+
+# ===== New: AMP dtype config (default: bf16; set AMP_DTYPE=fp16 to force fp16) =====
+# AMP_DTYPE_ENV = os.getenv("AMP_DTYPE", "bf16").lower()
+# def _amp_dtype_for_mlu():
+#     return torch.float16 if AMP_DTYPE_ENV in ("fp16", "float16", "16") else torch.bfloat16
+# def mlu_autocast():
+#     # torch.autocast supports device_type="mlu"
+#     return torch.autocast(device_type="mlu", dtype=_amp_dtype_for_mlu())
+
+# std_ref_audio_file = os.path.join(model_dir, 'ref_audio.wav')
+# std_ref_text_file = os.path.join(model_dir, 'ref_text.txt')
+# Default reference voice used by the /tts endpoint; loaded by init().
+std_ref_audio_file = '/app/ref_audio.wav'
+std_ref_text_file = '/app/ref_text.txt'
+std_ref_audio = None
+std_ref_text = None
+
+def init():
+    global ema_model, vocoder
+    global std_ref_audio, std_ref_text
+    logger.info(f'{device=}')
+    # load vocoder
+    vocoder_name = 'vocos'
+    vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=True, local_path=vocoder_dir, device=device)
+
+    # load TTS model
+    model_cfg = OmegaConf.load('/app/F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml')
+    model_cls = get_class(f'f5_tts.model.{model_cfg.model.backbone}')
+    model_arc = model_cfg.model.arch
+    ckpt_file = os.path.join(model_dir, model_name)
+    vocab_file = os.path.join(model_dir, 'vocab.txt')
+    ema_model = load_model(
+        model_cls, model_arc, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file, device=device
+    )
+    try:
+        ema_model.eval()
+    except Exception:
+        pass
+    with open(std_ref_audio_file, 'rb') as f:
+        std_ref_audio = f.read()
+    with open(std_ref_text_file, 'r', encoding='utf-8') as f:
+        std_ref_text = f.read().strip()
+
+    
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    init()
+    yield
+    pass
+
+app = FastAPI(lifespan=lifespan)
+
+@app.get("/health")
+@app.get("/ready")
+async def ready():
+    return JSONResponse(status_code=200, content={"message": "success"})
+
+def encode_audio_key(audio_bytes: bytes) -> str:
+    return hashlib.md5(audio_bytes[:16000]).hexdigest()[:16]
+
+@app.post("/register_voice")
+async def register_voice(
+    audio: UploadFile = File(...),
+    text: str = Form(...)
+):
+    global voice_dict
+
+    audio_bytes = await audio.read()
+    audio_key = encode_audio_key(audio_bytes)
+    # Ensure ref_text ends with a proper sentence-ending punctuation
+    if not text.endswith(". ") and not text.endswith("。"):
+        if text.endswith("."):
+            text += " "
+        else:
+            text += ". "
+    voice_dict[audio_key] = {
+        'ref_audio': audio_bytes,
+        'ref_text': text.strip()
+    }
+
+    # warmup
+    for _ in generate("流式语音合成，合成测试", audio_key, fast_infer=2):
+        logger.info("Warming up")
+
+    response = {
+        "status": "success",
+        "audio_key": audio_key
+    }
+    return JSONResponse(status_code=200, content=response)
+
+
+symbols = """,.!?;:()[]{}<>，。！？；：【】《》……'"’“”_—"""
+def contains_words(text):
+    return any(char not in symbols for char in text)
+
+def split_text(text, max_chars=135, cut_short_first=False):
+    sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[；：，。！？])", text)
+    sentences = [s.strip() for s in sentences if s.strip()]
+
+    chunks = []
+    current_chunk = ""
+    for sentence in sentences:
+        if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
+            current_chunk += sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
+        else:
+            if current_chunk and contains_words(current_chunk):
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
+
+    if current_chunk and contains_words(current_chunk):
+        chunks.append(current_chunk.strip())
+
+    if cut_short_first:
+        first_sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[；：，。！？])", chunks[0])
+        first = first_sentences[0].strip()
+        rest = "".join(first_sentences[1:]).strip()
+        first_chunk = [first, rest] if rest else [first]
+        final_chunks = first_chunk + chunks[1:]
+    else:
+        final_chunks = chunks
+
+    return final_chunks
+
+def audio_postprocess(audio: np.ndarray, ori_sr: int, target_sr: int) -> np.ndarray:
+    number_of_samples = int(len(audio) * float(target_sr) / ori_sr)
+    audio_resampled = signal.resample(audio, number_of_samples)
+    if audio.dtype == np.float32:
+        audio_resampled = np.clip(audio_resampled, -1.0, 1.0)
+        audio_resampled = (audio_resampled * 32767).astype(np.int16)
+    return audio_resampled
+
+
+def generate(gen_text, ref_audio_key, fast_infer=0):
+    global voice_dict, ema_model, vocoder
+    ref_audio_ = voice_dict[ref_audio_key]['ref_audio']
+    ref_text_ = voice_dict[ref_audio_key]['ref_text']
+
+    nfe_step = 16
+    if fast_infer >= 1:
+        nfe_step = 7
+        # nonuniform_step = True
+    # if fast_infer >= 2:
+    #     ref_audio_ = voice_dict[ref_audio_key].get('ref_audio_slice', ref_audio_)
+    #     ref_text_ = voice_dict[ref_audio_key].get('ref_text_slice', ref_text_)
+
+    audio, sr = torchaudio.load(io.BytesIO(ref_audio_))
+    max_chars = int(len(ref_text_.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr))
+    gen_text_batches = split_text(gen_text, max_chars=max_chars, cut_short_first=(fast_infer > 0))
+
+    for gen_audio, gen_sr in infer_batch_process(
+        (audio, sr),
+        ref_text_,
+        gen_text_batches,
+        ema_model,
+        vocoder,
+        device=device,
+        streaming=True,
+        chunk_size=int(24e6),
+        nfe_step=nfe_step,
+        speed=speed,
+    ):
+        yield audio_postprocess(gen_audio, gen_sr, TARGET_SR).tobytes()
+
+def generate_with_audio(gen_text, ref_audio, ref_text, fast_infer=0):
+    global ema_model, vocoder
+
+    if not contains_words(gen_text):
+        audio = np.zeros(N_ZEROS, dtype=np.int16).tobytes()
+        yield audio
+        return
+
+    nfe_step = 16
+    if fast_infer >= 1:
+        nfe_step = 7
+
+    audio, sr = torchaudio.load(io.BytesIO(ref_audio))
+    max_chars = min(int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr)), 135)
+    gen_text_batches = split_text(gen_text, max_chars=max_chars, cut_short_first=(fast_infer > 0))
+    
+    for gen_audio, gen_sr in infer_batch_process(
+        (audio, sr),
+        ref_text,
+        gen_text_batches,
+        ema_model,
+        vocoder,
+        device=device,
+        streaming=True,
+        chunk_size=int(24e6),
+        nfe_step=nfe_step,
+        speed=speed,
+    ):
+        yield audio_postprocess(gen_audio, gen_sr, TARGET_SR).tobytes()
+
+
+@app.post("/synthesize")
+async def synthesize(request: Request):
+    data = await request.json()
+    text = data['text']
+    audio_key = data['audio_key']
+    fast_infer = data.get('fast_infer', 0)
+    if fast_infer == True:
+        fast_infer = 2
+    else:
+        fast_infer = int(fast_infer)
+    # logger.info(f"Synthesizing text: {text}, audio_key: {audio_key}, fast_infer: {fast_infer}")
+
+    if not contains_words(text):
+        audio = np.zeros(N_ZEROS, dtype=np.int16).tobytes()
+        return Response(audio, media_type='audio/wav')
+
+    global voice_dict
+    if audio_key not in voice_dict:
+        raise HTTPException(status_code=400, detail="Invalid audio key")
+
+    return StreamingResponse(generate(text, audio_key, fast_infer), media_type="audio/wav")
+
+
+xml_namespace = "{http://www.w3.org/XML/1998/namespace}"
+@app.post("/tts")
+def predict(ssml: str = Body(...), fast_infer: Union[bool, int] = 0):
+    try:
+        root = ET.fromstring(ssml)
+        voice_element = root.find(".//voice")
+        if voice_element is not None:
+            transcription = voice_element.text.strip()
+            language = voice_element.get(f'{xml_namespace}lang', "zh").strip()
+            # voice_name = voice_element.get("name", "zh-f-soft-1").strip()
+        else:
+            return JSONResponse(status_code=400, content={"message": "Invalid SSML format: <voice> element not found."})
+    except ET.ParseError as e:
+        return JSONResponse(status_code=400, content={"message": "Invalid SSML format", "Exception": str(e)})
+
+    fast_infer = int(fast_infer)
+
+    return StreamingResponse(
+        generate_with_audio(transcription, std_ref_audio, std_ref_text, fast_infer),
+        media_type="audio/wav"
+    )
+
+
+# @app.get("/health_check")
+# async def health_check():
+#     try:
+#         a = torch.ones(10, 20, dtype=torch.float32, device='cuda')
+#         b = torch.ones(20, 10, dtype=torch.float32, device='cuda')
+#         c = torch.matmul(a, b)
+#         if c.sum() == 10 * 20 * 10:
+#             return {"status": "ok"}
+#         else:
+#             raise HTTPException(status_code=503)
+#     except Exception as e:
+#         print(f'health_check failed')
+#         raise HTTPException(status_code=503)
+
+
+# Script entry point: serve the "f5_server:app" application with uvicorn on
+# all interfaces, port 80.
+if __name__ == "__main__":
+    uvicorn.run("f5_server:app", host="0.0.0.0", port=80)