update README

This commit is contained in:
2025-09-10 10:47:02 +08:00
parent 5088f0b50a
commit ff78032400
603 changed files with 21 additions and 23 deletions

View File

@@ -0,0 +1,19 @@
FROM git.modelhub.org.cn:9443/enginex-cambricon/mlu370-pytorch:v25.01-torch2.5.0-torchmlu1.24.1-ubuntu22.04-py310
WORKDIR /workspace
# The base image activates /torch/venv3/pytorch; point PATH and interactive
# shells at the inference venv instead.
ENV PATH=/torch/venv3/pytorch_infer/bin:/workspace/ffmpeg-mlu-v4.2.0/install/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
RUN sed -i 's|source /torch/venv3/pytorch/bin/activate|source /torch/venv3/pytorch_infer/bin/activate|' /root/.bashrc
# espeak-ng backs the phonemizer package used for text-to-phoneme conversion.
# --no-install-recommends keeps the layer small; apt lists are removed in the
# same layer so they never land in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends espeak-ng && \
    rm -rf /var/lib/apt/lists/*
COPY requirements_matcha.txt constraints_matcha.txt matcha_server.py launch_matcha.sh /workspace/
# NOTE(review): constraints_matcha.txt is copied but never passed via `-c`;
# confirm whether the constraints file should be applied here.
# Single layer for both installs; --no-cache-dir avoids baking the pip cache in.
# (The duplicate `WORKDIR /workspace` from the original was removed.)
RUN pip install --no-cache-dir -r requirements_matcha.txt && \
    pip install --no-cache-dir matcha-tts
ENTRYPOINT ["/bin/bash", "launch_matcha.sh"]

44
mlu_370-matcha/README.md Normal file
View File

@@ -0,0 +1,44 @@
# Matcha-TTS
本项目基于 **matcha** 模型封装,提供简洁的 Docker 部署方式,支持 **SSML 输入**,输出 **PCM 原始音频**,可用于语音合成。
---
## Quickstart
### 1. 构建镜像
```bash
docker build -t tts:matcha . -f Dockerfile_matcha
```
### 2. 启动服务
```bash
docker run -it --rm \
-v /models/matcha_model:/mnt/models \
--device=/dev/cambricon_dev0:/dev/cambricon_dev0 \
--device=/dev/cambricon_ctl:/dev/cambricon_ctl \
-p 8080:80 \
-e MODEL_DIR=/mnt/models \
-e MODEL_NAME=model.ckpt \
tts:matcha
```
参数说明:
- `MODEL_DIR`:模型所在目录(挂载到容器内 `/mnt/models`)
- `MODEL_NAME`:加载的模型文件名(通常为 `.ckpt`,如启动示例中的 `model.ckpt`)
- `-p 8080:80`:将容器内服务端口映射到宿主机 `8080`
### 3. 测试服务
```bash
curl --request POST "http://localhost:8080/tts" \
--header 'Content-Type: application/ssml+xml' \
--header 'User-Agent: curl' \
--data-raw '<speak version="1.0" xml:lang="zh">
<voice xml:lang="zh" xml:gender="Female" name="zh">
今天天气很好,不知道明天天气怎么样。
</voice>
</speak>' \
--output sound.pcm
```
---

View File

@@ -0,0 +1,3 @@
torch==2.1.0+corex.3.2.1
numpy==1.23.5
scipy==1.14.1

View File

@@ -0,0 +1,4 @@
#!/bin/bash
# exec replaces the shell with the server process so container signals
# (SIGTERM from `docker stop`) reach Python directly for a clean shutdown.
exec python3 matcha_server.py

View File

@@ -0,0 +1,198 @@
import os

# Model location is configured via environment variables; defaults match the
# Docker image layout (see README: MODEL_DIR is the mounted model directory,
# MODEL_NAME the checkpoint file inside it).
model_dir = os.getenv("MODEL_DIR", "/mounted_model")
model_name = os.getenv("MODEL_NAME", "model.ckpt")

import logging

# Configure root logging before any other module emits records; verbosity is
# driven by the LOGLEVEL environment variable.
logging.basicConfig(
    format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO"),
)
logger = logging.getLogger(__file__)

# enable custom patcher if available
# patcher_path = os.path.join(model_dir, "custom_patcher.py")
# if os.path.exists(patcher_path):
#     import shutil
#     shutil.copyfile(patcher_path, "custom_patcher.py")
#     try:
#         import custom_patcher
#         logger.info("Custom patcher has been applied.")
#     except ImportError:
#         logger.info("Failed to import custom_patcher. Ensure it is a valid Python module.")
# else:
#     logger.info("No custom_patcher found.")
import wave
import numpy as np
from scipy.signal import resample
import re
from fastapi import FastAPI, Response, Body, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from contextlib import asynccontextmanager
import uvicorn
import xml.etree.ElementTree as ET
import torch
torch.set_num_threads(4)
# torch.backends.cuda.enable_flash_sdp(False)
# torch.backends.cuda.enable_mem_efficient_sdp(False)
# torch.backends.cuda.enable_math_sdp(True)
from torch import Tensor
from torch.nn import functional as F
from typing import Optional, List
# def conv_transpose1d_forward(self, input: Tensor, output_size: Optional[List[int]] = None) -> Tensor:
# if self.padding_mode != 'zeros':
# raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
# assert isinstance(self.padding, tuple)
# # One cannot replace List by Tuple or Sequence in "_output_padding" because
# # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
# num_spatial_dims = 1
# output_padding = self._output_padding(
# input, output_size, self.stride, self.padding, self.kernel_size, # type: ignore[arg-type]
# num_spatial_dims, self.dilation) # type: ignore[arg-type]
# with torch.amp.autocast('cuda', dtype=torch.float16):
# return F.conv_transpose1d(
# input, self.weight, self.bias, self.stride, self.padding,
# output_padding, self.groups, self.dilation).float()
# torch.nn.ConvTranspose1d.forward = conv_transpose1d_forward
from matcha.cli import load_matcha, load_vocoder, to_waveform, process_text
# Globals populated once by init() during application startup.
model = None
vocoder = None
denoiser = None
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'mlu'  # Cambricon MLU accelerator (see Dockerfile base image)
# Sample rate the acoustic model/vocoder produce; output audio is resampled
# down to TARGET_SR before being returned to the client.
MODEL_SR = int(os.getenv("MODEL_SR", 22050))
speaking_rate = float(os.getenv("SPEAKING_RATE", 1.0))
TARGET_SR = 16000
# Number of int16 zero samples returned for punctuation-only input.
N_ZEROS = 100
def init():
    """Load the Matcha acoustic model and HiFi-GAN vocoder, then warm up.

    Populates the module-level ``model``, ``vocoder`` and ``denoiser``
    globals from files under ``model_dir``.
    """
    global model, vocoder, denoiser
    checkpoint = os.path.join(model_dir, model_name)
    model = load_matcha("custom_model", checkpoint, device)
    vocoder, denoiser = load_vocoder(
        "hifigan_T2_v1", os.path.join(model_dir, "generator_v1"), device
    )
    # Warmup pass so the first real request does not pay first-run cost.
    for _ in generate("你好,欢迎使用语音合成服务。"):
        pass
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load models before serving; no teardown needed."""
    init()
    yield
# Models are loaded via the lifespan hook before the app starts serving.
app = FastAPI(lifespan=lifespan)
xml_namespace = "{http://www.w3.org/XML/1998/namespace}"
# Characters treated as pure punctuation: text made up only of these produces
# no speech, so synthesis is short-circuited for it.
symbols = ',.!?;:()[]{}<>,。!?;:【】《》……"“”_—'


def contains_words(text):
    """Return True if *text* contains at least one non-punctuation character."""
    for ch in text:
        if ch not in symbols:
            return True
    return False
def split_text(text, max_chars=135):
    """Split *text* into stripped, non-empty sentence-level chunks.

    Sentences are cut after ASCII terminators followed by whitespace, or
    directly after fullwidth CJK terminators.

    Note: ``max_chars`` is currently unused — splitting is purely
    sentence-boundary based.
    """
    pieces = re.split(r"(?<=[;:.!?])\s+|(?<=[。!?])", text)
    return [piece.strip() for piece in pieces if piece.strip()]
def audio_postprocess(audio: np.ndarray, ori_sr: int, target_sr: int) -> np.ndarray:
    """Resample *audio* from ``ori_sr`` to ``target_sr`` and convert to int16 PCM.

    Args:
        audio: 1-D waveform; floating-point values are assumed in [-1, 1].
        ori_sr: sample rate of *audio*.
        target_sr: desired output sample rate.

    Returns:
        int16 numpy array at ``target_sr`` (already-int16 input that needs no
        resampling passes through unchanged).
    """
    if ori_sr != target_sr:
        n_samples = int(len(audio) * float(target_sr) / ori_sr)
        audio_resampled = resample(audio, n_samples)
    else:
        audio_resampled = audio
    # BUG FIX: the original checked the INPUT dtype (float32 only) but
    # converted the RESAMPLED array. scipy's resample can upcast to float64,
    # so float data could leak through unconverted. Convert whenever the
    # array we are about to return holds floats.
    if np.issubdtype(audio_resampled.dtype, np.floating):
        audio_resampled = np.clip(audio_resampled, -1.0, 1.0)
        audio_resampled = (audio_resampled * 32767).astype(np.int16)
    return audio_resampled
def generate(texts):
    """Yield 16 kHz int16 PCM byte chunks, one per sentence of *texts*.

    Each sentence produced by split_text() is synthesised independently so the
    response can stream. Sentences that fail text preprocessing are logged and
    skipped.
    """
    for chunk in split_text(texts):
        try:
            text_processed = process_text(0, chunk, device)
        except Exception as e:
            logger.error(f"Error processing text: {e}")
            # BUG FIX: the original fell through and used the undefined
            # text_processed, raising NameError; skip the bad chunk instead.
            continue
        with torch.inference_mode():
            output = model.synthesise(
                text_processed["x"],
                text_processed["x_lengths"],
                n_timesteps=10,
                temperature=0.667,
                spks=None,
                length_scale=speaking_rate
            )
            output["waveform"] = to_waveform(
                output["mel"], vocoder, denoiser, denoiser_strength=0.00025
            )
        audio = output["waveform"].detach().cpu().squeeze().numpy()
        yield audio_postprocess(audio, MODEL_SR, TARGET_SR).tobytes()
@app.post("/")
@app.post("/tts")
def predict(ssml: str = Body(...)):
    """Synthesise speech for an SSML document; streams 16 kHz int16 raw PCM.

    Expects a <voice> element whose text is the transcription. Returns 400 on
    malformed SSML, a short silence buffer for punctuation-only input, and a
    streaming PCM response otherwise.
    """
    try:
        root = ET.fromstring(ssml)
        voice_element = root.find(".//voice")
        if voice_element is None:
            return JSONResponse(status_code=400, content={"message": "Invalid SSML format: <voice> element not found."})
        # BUG FIX: an empty <voice/> element has .text == None; calling
        # .strip() on it raised AttributeError (surfacing as a 500).
        transcription = (voice_element.text or "").strip()
    except ET.ParseError as e:
        return JSONResponse(status_code=400, content={"message": "Invalid SSML format", "Exception": str(e)})
    if not contains_words(transcription):
        # Nothing speakable: return a short buffer of silence.
        audio = np.zeros(N_ZEROS, dtype=np.int16).tobytes()
        return Response(audio, media_type='audio/wav')
    return StreamingResponse(generate(transcription), media_type='audio/wav')
@app.get("/health")
@app.get("/ready")
async def ready():
    """Liveness/readiness probe: reports success once the app is serving."""
    return JSONResponse(content={"message": "success"}, status_code=200)
@app.get("/health_check")
async def health_check():
    """Deep health probe: run a small matmul on the accelerator.

    Returns {"status": "ok"} on success; raises HTTPException(503) otherwise.
    """
    try:
        # BUG FIX: the original hard-coded device='cuda', which is never the
        # device this server runs on (module-level device is 'mlu').
        a = torch.ones(10, 20, dtype=torch.float32, device=device)
        b = torch.ones(20, 10, dtype=torch.float32, device=device)
        healthy = torch.matmul(a, b).sum().item() == 10 * 20 * 10
    except Exception:
        # BUG FIX: in the original, the HTTPException raised for a wrong sum
        # was itself caught by this handler; also use the logger instead of
        # a bare print so failures land in the service log.
        logger.exception("health_check failed")
        healthy = False
    if not healthy:
        raise HTTPException(status_code=503)
    return {"status": "ok"}
if __name__ == "__main__":
    # uvicorn is already imported at module level; the redundant local
    # re-import was removed. Bind all interfaces on the container port.
    uvicorn.run(app, host="0.0.0.0", port=80)

View File

@@ -0,0 +1,47 @@
# --------- pytorch --------- #
torch>=2.0.0
torchvision>=0.15.0
lightning>=2.0.0
torchmetrics>=0.11.4
# --------- hydra --------- #
hydra-core==1.3.2
hydra-colorlog==1.2.0
hydra-optuna-sweeper==1.2.0
# --------- loggers --------- #
# wandb
# neptune-client
# mlflow
# comet-ml
# aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550
# --------- others --------- #
rootutils # standardizing the project root setup
pre-commit # hooks for applying linters on commit
rich # beautiful text formatting in terminal
pytest # tests
# sh # for running bash commands in some tests (linux/macos only)
phonemizer # phonemization of text
tensorboard
librosa
Cython
numpy
einops
inflect
Unidecode
scipy
torchaudio
matplotlib
pandas
conformer==0.3.2
diffusers # developed using version ==0.25.0
notebook
ipywidgets
gradio==3.43.2
gdown
wget
seaborn
fastapi
uvicorn[standard]