init muxi

This commit is contained in:
2025-09-12 11:39:55 +08:00
commit 96ef2da601
602 changed files with 591073 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
# Matcha-TTS serving image built on the MetaX MACA PyTorch base image.
FROM git.modelhub.org.cn:9443/enginex-metax/maca-c500-pytorch:2.33.0.6-torch2.6-py310-ubuntu24.04-amd64
WORKDIR /workspace
# Put the base image's conda installation first on PATH so `python`/`pip`
# resolve to it.
ENV CONDA_DIR=/opt/conda
ENV PATH=${CONDA_DIR}/bin:${PATH}
# Install espeak-ng (phonemizer backend used by Matcha-TTS text processing).
# apt is pointed at a private temp dir (and /tmp permissions are repaired)
# presumably to cope with a restricted /tmp in the base image — TODO confirm.
RUN set -eux; \
chmod 1777 /tmp; \
mkdir -p /var/tmp/apt-tmp && chmod 1777 /var/tmp/apt-tmp; \
apt-get -o Dir::Temp::=/var/tmp/apt-tmp update && \
DEBIAN_FRONTEND=noninteractive apt-get -o Dir::Temp::=/var/tmp/apt-tmp install -y --no-install-recommends espeak-ng && \
rm -rf /var/lib/apt/lists/* /var/tmp/apt-tmp
# Server code plus pinned dependency sets.
COPY requirements_matcha.txt constraints_matcha.txt matcha_server.py launch_matcha.sh /workspace/
RUN pip install -r requirements_matcha.txt -c constraints_matcha.txt
RUN pip install matcha-tts -c constraints_matcha.txt
ENTRYPOINT ["/bin/bash", "launch_matcha.sh"]

View File

@@ -0,0 +1,44 @@
# Matcha-TTS
本项目基于 **matcha** 模型封装,提供简洁的 Docker 部署方式,支持 **SSML 输入**,输出 **PCM 原始音频**,可用于语音合成。
---
## Quickstart
### 1. 安装镜像
```bash
docker build -t tts:matcha . -f Dockerfile_matcha
```
### 2. 启动服务
```bash
metax-docker run -it --rm \
--gpus=[0] \
-v /models/matcha_model:/mnt/models \
-p 8080:80 \
-e MODEL_DIR=/mnt/models \
-e MODEL_NAME=model.ckpt \
tts:matcha
```
参数说明:
- `MODEL_DIR`:模型所在目录(挂载到容器内 `/mnt/models`)
- `MODEL_NAME`:加载的模型文件名(通常为 `.ckpt`,默认 `model.ckpt`)
- `-p 8080:80`:将容器内服务端口映射到宿主机 `8080`
### 3. 测试服务
```bash
curl --request POST "http://localhost:8080/tts" \
--header 'Content-Type: application/ssml+xml' \
--header 'User-Agent: curl' \
--data-raw '<speak version="1.0" xml:lang="zh">
<voice xml:lang="zh" xml:gender="Female" name="zh">
今天天气很好,不知道明天天气怎么样。
</voice>
</speak>' \
--output sound.pcm
```
---

View File

@@ -0,0 +1 @@
numpy==1.26.4

View File

@@ -0,0 +1,4 @@
#!/bin/bash
# exec replaces the shell with the server process so it becomes PID 1's
# direct child and receives container signals (SIGTERM from `docker stop`)
# instead of them stopping at the wrapper shell.
exec python3 matcha_server.py

View File

@@ -0,0 +1,197 @@
import os

# Model location is configured through the environment so one image can
# serve any mounted checkpoint directory.
model_dir = os.getenv("MODEL_DIR", "/mounted_model")
model_name = os.getenv("MODEL_NAME", "model.ckpt")

import logging

logging.basicConfig(
    format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO"),
)
logger = logging.getLogger(__file__)

# enable custom patcher if available
# A model directory may ship a custom_patcher.py; copying it next to this
# server makes it importable, and importing it runs its side effects
# (whatever patching it performs — the module itself is opaque here).
patcher_path = os.path.join(model_dir, "custom_patcher.py")
if os.path.exists(patcher_path):
    import shutil
    shutil.copyfile(patcher_path, "custom_patcher.py")
    try:
        import custom_patcher
        logger.info("Custom patcher has been applied.")
    except ImportError:
        logger.info("Failed to import custom_patcher. Ensure it is a valid Python module.")
else:
    logger.info("No custom_patcher found.")

import wave
import numpy as np
from scipy.signal import resample
import re
from fastapi import FastAPI, Response, Body, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from contextlib import asynccontextmanager
import uvicorn
import xml.etree.ElementTree as ET
import torch

# Cap intra-op CPU threads so the server stays well-behaved in a container.
torch.set_num_threads(4)
# torch.backends.cuda.enable_flash_sdp(False)
# torch.backends.cuda.enable_mem_efficient_sdp(False)
# torch.backends.cuda.enable_math_sdp(True)
from torch import Tensor
from torch.nn import functional as F
from typing import Optional, List
# def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
#     if self.padding_mode != 'zeros':
#         raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
#     assert isinstance(self.padding, tuple)
#     # One cannot replace List by Tuple or Sequence in "_output_padding" because
#     # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
#     num_spatial_dims = 1
#     output_padding = self._output_padding(
#         input, output_size, self.stride, self.padding, self.kernel_size,  # type: ignore[arg-type]
#         num_spatial_dims, self.dilation)  # type: ignore[arg-type]
#     with torch.amp.autocast('cuda', dtype=torch.float16):
#         return F.conv_transpose1d(
#             input, self.weight, self.bias, self.stride, self.padding,
#             output_padding, self.groups, self.dilation).float()
# torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
from matcha.cli import load_matcha, load_vocoder, to_waveform, process_text

# Populated by init() at application startup (see lifespan()).
model = None
vocoder = None
denoiser = None
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Sample rate the acoustic model/vocoder produce (22.05 kHz by default).
MODEL_SR = int(os.getenv("MODEL_SR", 22050))
# Matcha length_scale; >1 slows speech, <1 speeds it up.
speaking_rate = float(os.getenv("SPEAKING_RATE", 1.0))
# All audio is resampled to 16 kHz PCM before being returned to clients.
TARGET_SR = 16000
# Number of int16 zero samples returned when the input has no speakable text.
N_ZEROS = 100
def init():
    """Load the Matcha acoustic model and HiFi-GAN vocoder, then warm up.

    Populates the module-level ``model``, ``vocoder`` and ``denoiser``
    globals from the mounted model directory on the configured device.
    """
    global model, vocoder, denoiser
    checkpoint_path = os.path.join(model_dir, model_name)
    hifigan_path = os.path.join(model_dir, "generator_v1")
    model = load_matcha("custom_model", checkpoint_path, device)
    vocoder, denoiser = load_vocoder("hifigan_T2_v1", hifigan_path, device)
    # Warm up: run one synthesis end-to-end so the first real request
    # does not pay for lazy initialization.
    for _ in generate("你好,欢迎使用语音合成服务。"):
        pass
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load models once at startup; no teardown needed."""
    init()
    yield


app = FastAPI(lifespan=lifespan)
# ElementTree prefix for attributes in the XML namespace (e.g. xml:lang).
xml_namespace = "{http://www.w3.org/XML/1998/namespace}"
# Characters treated as pure punctuation, i.e. not speakable content.
symbols = ',.!?;:()[]{}<>,。!?;:【】《》……"“”_—'


def contains_words(text):
    """Return True if *text* contains at least one non-punctuation character."""
    return not all(ch in symbols for ch in text)
def split_text(text, max_chars=135):
    """Split *text* into sentence-sized chunks for incremental synthesis.

    Splits after Western sentence punctuation (``;:.!?``) followed by
    whitespace, or immediately after CJK full-width sentence enders
    (``。!?``) via a zero-width split, so the punctuation stays attached
    to its sentence.

    Args:
        text: Input transcription.
        max_chars: Unused; kept for backward compatibility with callers.
            The byte-budget chunk merging that consumed it is disabled.

    Returns:
        A list of non-empty, stripped sentence strings (empty list for
        blank input).
    """
    sentences = re.split(r"(?<=[;:.!?])\s+|(?<=[。!?])", text)
    return [s.strip() for s in sentences if s.strip()]
def audio_postprocess(audio: np.ndarray, ori_sr: int, target_sr: int) -> np.ndarray:
    """Resample *audio* to *target_sr* and convert it to 16-bit PCM.

    Args:
        audio: 1-D waveform. Float samples are assumed to lie in [-1, 1].
        ori_sr: Sample rate of *audio*.
        target_sr: Desired output sample rate.

    Returns:
        The waveform as an ``int16`` array at *target_sr*.
    """
    if ori_sr != target_sr:
        number_of_samples = int(len(audio) * float(target_sr) / ori_sr)
        audio_resampled = resample(audio, number_of_samples)
    else:
        audio_resampled = audio
    if np.issubdtype(audio.dtype, np.floating):
        # Accept any float width (the previous version only handled float32,
        # letting float64 input escape un-scaled) and clamp before scaling.
        audio_resampled = np.clip(audio_resampled, -1.0, 1.0)
        audio_resampled = (audio_resampled * 32767).astype(np.int16)
    elif audio_resampled.dtype != np.int16:
        # scipy's resample promotes integer input to float; restore int16 PCM
        # so callers always get a valid PCM byte stream.
        audio_resampled = audio_resampled.astype(np.int16)
    return audio_resampled
def generate(texts):
    """Yield 16 kHz int16 PCM byte chunks synthesised from *texts*.

    The input is split into sentences; each sentence is synthesised
    independently so audio can be streamed while later sentences are
    still being generated.

    Args:
        texts: Plain transcription text.

    Yields:
        ``bytes`` of int16 PCM at ``TARGET_SR`` for each sentence.
    """
    for chunk in split_text(texts):
        try:
            text_processed = process_text(0, chunk, device)
        except Exception as e:
            logger.error(f"Error processing text: {e}")
            # Bug fix: skip this chunk; previously the loop fell through and
            # used an undefined (or stale) text_processed.
            continue
        with torch.inference_mode():
            output = model.synthesise(
                text_processed["x"],
                text_processed["x_lengths"],
                n_timesteps=10,
                temperature=0.667,
                spks=None,
                length_scale=speaking_rate
            )
            output["waveform"] = to_waveform(output["mel"], vocoder, denoiser, denoiser_strength=0.00025)
        audio = output["waveform"].detach().cpu().squeeze().numpy()
        yield audio_postprocess(audio, MODEL_SR, TARGET_SR).tobytes()
@app.post("/")
@app.post("/tts")
def predict(ssml: str = Body(...)):
    """Synthesise speech from an SSML document.

    Extracts the text of the first ``<voice>`` element and streams it as
    16 kHz int16 PCM. Returns 400 for unparseable SSML or a missing
    ``<voice>`` element; returns a short block of silence when the text
    contains no speakable characters.
    """
    try:
        root = ET.fromstring(ssml)
        voice_element = root.find(".//voice")
        if voice_element is None:
            return JSONResponse(status_code=400, content={"message": "Invalid SSML format: <voice> element not found."})
        # Bug fix: an empty <voice/> has .text == None, which previously
        # raised AttributeError (HTTP 500) on .strip().
        transcription = (voice_element.text or "").strip()
        language = voice_element.get(f'{xml_namespace}lang', "zh").strip()
        # voice_name = voice_element.get("name", "zh-f-soft-1").strip()
    except ET.ParseError as e:
        return JSONResponse(status_code=400, content={"message": "Invalid SSML format", "Exception": str(e)})
    if not contains_words(transcription):
        # No speakable content: reply with a short silence so clients that
        # always expect audio bytes keep working.
        audio = np.zeros(N_ZEROS, dtype=np.int16).tobytes()
        return Response(audio, media_type='audio/wav')
    return StreamingResponse(generate(transcription), media_type='audio/wav')
@app.get("/health")
@app.get("/ready")
async def ready():
    """Liveness/readiness probe: reports success once the app is serving."""
    payload = {"message": "success"}
    return JSONResponse(status_code=200, content=payload)
@app.get("/health_check")
async def health_check():
    """Deep health probe: run a small matmul on the serving device.

    Returns:
        ``{"status": "ok"}`` when the computation produces the expected
        result.

    Raises:
        HTTPException: 503 when the computation fails or is wrong.
    """
    try:
        # Fix: use the module-level `device` instead of hard-coded 'cuda' so
        # the probe also passes on CPU-only hosts the server supports.
        a = torch.ones(10, 20, dtype=torch.float32, device=device)
        b = torch.ones(20, 10, dtype=torch.float32, device=device)
        c = torch.matmul(a, b)
        # Every entry of the 10x10 result equals 20, so the sum must be 2000.
        if c.sum() == 10 * 20 * 10:
            return {"status": "ok"}
        raise HTTPException(status_code=503)
    except HTTPException:
        raise
    except Exception:
        # Fix: log with traceback via the module logger instead of print().
        logger.exception("health_check failed")
        raise HTTPException(status_code=503)
if __name__ == "__main__":
    # uvicorn is already imported at module level; the redundant local
    # re-import was removed. Port 80 matches the Dockerfile's README mapping.
    uvicorn.run(app, host="0.0.0.0", port=80)

View File

@@ -0,0 +1,47 @@
# --------- pytorch --------- #
torch>=2.0.0
torchvision>=0.15.0
lightning>=2.0.0
torchmetrics>=0.11.4
# --------- hydra --------- #
hydra-core==1.3.2
hydra-colorlog==1.2.0
hydra-optuna-sweeper==1.2.0
# --------- loggers --------- #
# wandb
# neptune-client
# mlflow
# comet-ml
# aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550
# --------- others --------- #
rootutils # standardizing the project root setup
pre-commit # hooks for applying linters on commit
rich # beautiful text formatting in terminal
pytest # tests
# sh # for running bash commands in some tests (linux/macos only)
phonemizer # phonemization of text
tensorboard
librosa
Cython
numpy
einops
inflect
Unidecode
scipy
torchaudio
matplotlib
pandas
conformer==0.3.2
diffusers # developed using version ==0.25.0
notebook
ipywidgets
gradio==3.43.2
gdown
wget
seaborn
fastapi
uvicorn[standard]