init ascend tts

This commit is contained in:
2025-09-05 11:27:43 +08:00
parent d53ac91bb6
commit b92a65b0fa
602 changed files with 590901 additions and 1 deletions

View File

@@ -0,0 +1,19 @@
# Base image: vLLM build for Ascend NPUs (ships CANN runtime + torch_npu).
FROM quay.io/ascend/vllm-ascend:v0.10.0rc1
WORKDIR /workspace
# Switch apt to the TUNA mirror (ubuntu-ports, i.e. non-amd64 arches),
# install espeak-ng (phonemizer backend required by Matcha-TTS text
# processing), then drop the apt cache to keep the layer small.
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
echo "deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \
echo "deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
echo "deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \
echo "deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list && \
apt-get update && \
apt-get install -y espeak-ng && \
rm -rf /var/lib/apt/lists/*
# Server sources plus pinned Python dependencies (constraints pin torch).
COPY requirements_matcha.txt constraints_matcha.txt matcha_server.py launch_matcha.sh /workspace/
RUN pip install -r requirements_matcha.txt -c constraints_matcha.txt
RUN pip install matcha-tts -c constraints_matcha.txt
# launch_matcha.sh starts matcha_server.py (uvicorn on port 80).
ENTRYPOINT ["/bin/bash", "launch_matcha.sh"]

View File

@@ -0,0 +1,53 @@
# Matcha-TTS
本项目基于 **matcha** 模型封装,提供简洁的 Docker 部署方式,支持 **SSML 输入**,输出 **PCM 原始音频**,可用于语音合成。
---
## Quickstart
### 1. 安装镜像
```bash
docker build -t tts:matcha . -f Dockerfile_matcha
```
### 2. 启动服务
```bash
docker run -it --rm \
-v /models/matcha_model:/mnt/models \
-e ASCEND_VISIBLE_DEVICES=1 \
--device /dev/davinci2:/dev/davinci0 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
--privileged \
-p 8080:80 \
-e MODEL_DIR=/mnt/models \
-e MODEL_NAME=model.ckpt \
tts:matcha
```
参数说明:
- `MODEL_DIR`:模型所在目录(挂载到容器内 `/mnt/models`)
- `MODEL_NAME`:加载的模型文件名(通常为 `.ckpt`,如上例中的 `model.ckpt`)
- `-p 8080:80`:将容器内服务端口映射到宿主机 `8080`
### 3. 测试服务
```bash
curl --request POST "http://localhost:8080/tts" \
--header 'Content-Type: application/ssml+xml' \
--header 'User-Agent: curl' \
--data-raw '<speak version="1.0" xml:lang="zh">
<voice xml:lang="zh" xml:gender="Female" name="zh">
今天天气很好,不知道明天天气怎么样。
</voice>
</speak>' \
--output sound.pcm
```
---

View File

@@ -0,0 +1 @@
torch==2.7.1

View File

@@ -0,0 +1,4 @@
#!/bin/bash
# Container entrypoint for the Matcha-TTS server.
# BUG FIX: use `exec` so python replaces the shell and becomes PID 1 —
# otherwise SIGTERM from `docker stop` hits bash and is never forwarded,
# forcing a SIGKILL after the grace period.
exec python3 matcha_server.py

View File

@@ -0,0 +1,182 @@
import os
model_dir = os.getenv("MODEL_DIR", "/mounted_model")
model_name = os.getenv("MODEL_NAME", "model.ckpt")
import logging
logging.basicConfig(
format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO"),
)
logger = logging.getLogger(__file__)
import wave
import numpy as np
from scipy.signal import resample
import re
from fastapi import FastAPI, Response, Body, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from contextlib import asynccontextmanager
import uvicorn
import xml.etree.ElementTree as ET
import torch
torch.set_default_dtype(torch.float32)
_original_hann_window = torch.hann_window
def _safe_hann_window(window_length,
periodic=True,
*,
dtype=None,
layout=torch.strided,
device=None,
requires_grad=False,
**kwargs):
"""
NPU 不支持int64 hann_window, 替换实现
"""
if dtype is None:
dtype = torch.float32
# 总是在 CPU 先生成,绕过 NPU 上的 in-place cos 实现
win = _original_hann_window(
window_length,
periodic=periodic,
dtype=dtype,
layout=layout,
device="cpu",
requires_grad=requires_grad,
**kwargs,
)
if device is not None:
win = win.to(device)
return win
torch.hann_window = _safe_hann_window
from torch import Tensor
from torch.nn import functional as F
from typing import Optional, List
from matcha.cli import load_matcha, load_vocoder, to_waveform, process_text
model = None
vocoder = None
denoiser = None
device = 'npu'
MODEL_SR = int(os.getenv("MODEL_SR", 22050))
speaking_rate = float(os.getenv("SPEAKING_RATE", 1.0))
TARGET_SR = 16000
N_ZEROS = 100
def init():
    """Load the acoustic model, vocoder and denoiser, then warm up.

    Populates the module-level ``model``/``vocoder``/``denoiser`` globals
    from ``MODEL_DIR``; the warmup synthesis pays first-call setup cost
    before the server accepts traffic.
    """
    global model, vocoder, denoiser
    checkpoint_path = os.path.join(model_dir, model_name)
    hifigan_path = os.path.join(model_dir, "generator_v1")
    model = load_matcha("custom_model", checkpoint_path, device)
    vocoder, denoiser = load_vocoder("hifigan_T2_v1", hifigan_path, device)
    # Warmup: drain one full synthesis so the first request is fast.
    for _ in generate("你好,欢迎使用语音合成服务。"):
        pass
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load models once at startup; no shutdown work."""
    init()
    yield
app = FastAPI(lifespan=lifespan)
# Prefix for attributes in the XML namespace (e.g. xml:lang) as produced
# by ElementTree's Clark notation.
xml_namespace = "{http://www.w3.org/XML/1998/namespace}"
# Punctuation (ASCII + CJK) that does not count as speakable content.
symbols = ',.!?;:()[]{}<>,。!?;:【】《》……"“”_—'


def contains_words(text):
    """Return True if *text* has at least one non-punctuation character."""
    for ch in text:
        if ch not in symbols:
            return True
    return False
def split_text(text, max_chars=135):
    """Split *text* into sentence-sized chunks for chunked synthesis.

    Splits after ASCII sentence punctuation followed by whitespace, or
    directly after CJK sentence punctuation; blank pieces are dropped.
    NOTE(review): *max_chars* is currently unused — kept only for
    interface compatibility.
    """
    pieces = re.split(r"(?<=[;:.!?])\s+|(?<=[。!?])", text)
    return [piece.strip() for piece in pieces if piece.strip()]
def audio_postprocess(audio: np.ndarray, ori_sr: int, target_sr: int) -> np.ndarray:
    """Resample *audio* from ``ori_sr`` to ``target_sr`` and return int16 PCM.

    Float samples are assumed to be in [-1, 1]; they are clipped and
    scaled to int16 full range.  Non-float input is returned resampled
    but otherwise untouched.

    BUG FIX: the original only converted when the *input* dtype was
    exactly float32 — float64 audio (and scipy.signal.resample returns
    float64) skipped the int16 conversion entirely, so raw float bytes
    were streamed as if they were PCM16.  Check the resampled array for
    any floating dtype instead.
    """
    if ori_sr != target_sr:
        number_of_samples = int(len(audio) * float(target_sr) / ori_sr)
        audio = resample(audio, number_of_samples)
    if np.issubdtype(audio.dtype, np.floating):
        audio = np.clip(audio, -1.0, 1.0)
        audio = (audio * 32767).astype(np.int16)
    return audio
def generate(texts):
    """Yield 16 kHz int16 PCM byte chunks synthesized from *texts*.

    The text is split into sentences; each sentence is synthesized
    independently and streamed as one chunk.  A sentence whose text
    processing fails is logged and skipped.
    """
    for chunk in split_text(texts):
        try:
            text_processed = process_text(0, chunk, device)
        except Exception as e:
            logger.error(f"Error processing text: {e}")
            # BUG FIX: the original fell through after logging and then
            # used an undefined (first iteration: NameError) or stale
            # (later iterations: wrong audio) text_processed.
            continue
        with torch.inference_mode():
            output = model.synthesise(
                text_processed["x"],
                text_processed["x_lengths"],
                n_timesteps=10,
                temperature=0.667,
                spks=None,
                length_scale=speaking_rate,
            )
            output["waveform"] = to_waveform(output["mel"], vocoder, denoiser, denoiser_strength=0.00025)
        audio = output["waveform"].detach().cpu().squeeze().numpy()
        yield audio_postprocess(audio, MODEL_SR, TARGET_SR).tobytes()
@app.post("/")
@app.post("/tts")
def predict(ssml: str = Body(...)):
    """Synthesize speech from an SSML document.

    Expects a <speak> document with a <voice> element; its text content
    is synthesized and streamed back as raw 16 kHz int16 PCM.  Returns
    400 for malformed XML or a missing <voice> element; text that is
    only punctuation yields a short silence instead of running the model.
    """
    try:
        root = ET.fromstring(ssml)
        voice_element = root.find(".//voice")
        if voice_element is None:
            return JSONResponse(status_code=400, content={"message": "Invalid SSML format: <voice> element not found."})
        # BUG FIX: a self-closing <voice/> has .text == None; the original
        # crashed with AttributeError (HTTP 500) instead of answering 400/silence.
        transcription = (voice_element.text or "").strip()
        language = voice_element.get(f'{xml_namespace}lang', "zh").strip()
        # voice_name = voice_element.get("name", "zh-f-soft-1").strip()
    except ET.ParseError as e:
        return JSONResponse(status_code=400, content={"message": "Invalid SSML format", "Exception": str(e)})
    if not contains_words(transcription):
        # Nothing speakable: return a short burst of silence without
        # invoking the model.
        audio = np.zeros(N_ZEROS, dtype=np.int16).tobytes()
        return Response(audio, media_type='audio/wav')
    return StreamingResponse(generate(transcription), media_type='audio/wav')
@app.get("/health")
@app.get("/ready")
async def ready():
    """Liveness/readiness probe: reports success once the app is serving."""
    return JSONResponse(content={"message": "success"}, status_code=200)
@app.get("/health_check")
async def health_check():
    """Deep health probe: run a tiny matmul on the inference device.

    BUG FIX: the original allocated the probe tensors on 'cuda', which
    always fails on this Ascend/NPU deployment, so the endpoint returned
    503 even when the service was healthy.  Use the module-level
    ``device`` instead, and log the failure via the logger rather than
    a bare print.
    """
    try:
        a = torch.ones(10, 20, dtype=torch.float32, device=device)
        b = torch.ones(20, 10, dtype=torch.float32, device=device)
        c = torch.matmul(a, b)
        if c.sum() == 10 * 20 * 10:
            return {"status": "ok"}
        raise HTTPException(status_code=503)
    except HTTPException:
        # Pass the deliberate 503 through untouched.
        raise
    except Exception:
        logger.exception('health_check failed')
        raise HTTPException(status_code=503)
if __name__ == "__main__":
    # uvicorn is already imported at module level — the original re-imported
    # it here redundantly.  Serve on all interfaces, container port 80.
    uvicorn.run(app, host="0.0.0.0", port=80)

View File

@@ -0,0 +1,47 @@
# --------- pytorch --------- #
torch>=2.0.0
torchvision>=0.15.0
lightning>=2.0.0
torchmetrics>=0.11.4
# --------- hydra --------- #
hydra-core==1.3.2
hydra-colorlog==1.2.0
hydra-optuna-sweeper==1.2.0
# --------- loggers --------- #
# wandb
# neptune-client
# mlflow
# comet-ml
# aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550
# --------- others --------- #
rootutils # standardizing the project root setup
pre-commit # hooks for applying linters on commit
rich # beautiful text formatting in terminal
pytest # tests
# sh # for running bash commands in some tests (linux/macos only)
phonemizer # phonemization of text
tensorboard
librosa
Cython
numpy
einops
inflect
Unidecode
scipy
torchaudio
matplotlib
pandas
conformer==0.3.2
diffusers # developed using version ==0.25.0
notebook
ipywidgets
gradio==3.43.2
gdown
wget
seaborn
fastapi
uvicorn[standard]