初始化项目,由ModelHub XC社区提供模型
Model: maya-research/maya1 Source: Original Platform
This commit is contained in:
42
.gitattributes
vendored
Normal file
42
.gitattributes
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
checkpoint-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
checkpoint-15000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
checkpoint-20000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
checkpoint-25000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
venv/
|
||||
582
README.md
Normal file
582
README.md
Normal file
@@ -0,0 +1,582 @@
|
||||
---
|
||||
language:
|
||||
- en
|
||||
license: apache-2.0
|
||||
library_name: transformers
|
||||
pipeline_tag: text-to-speech
|
||||
---
|
||||
|
||||
# Maya1
|
||||
|
||||
**Maya1** is a state-of-the-art speech model for expressive voice generation, built to capture real human emotion and precise voice design.
|
||||
|
||||
**Try it:** [Playground](https://www.mayaresearch.ai/studio)
|
||||
|
||||
**What it does:**
|
||||
- Create any voice you can imagine — a 20s British girl, an American guy, or a full-blown demon.
|
||||
- Make it feel real with emotion tags: laugh, cry, whisper, rage, sigh, gasp.
|
||||
- It streams instantly, sounds alive, has 3B parameters, and runs on a single GPU.
|
||||
- Outperforms top proprietary models. Developed by Maya Research.
|
||||
|
||||
## Demos
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td width="50%">
|
||||
<strong>Energetic Female Event Host</strong><br/>
|
||||
<video controls playsinline width="100%" src="https://cdn-uploads.huggingface.co/production/uploads/642a7d4e556ab448a0701ca1/JKzy8zA36qvsOblV-lhd1.mp4">
|
||||
Your browser does not support video.
|
||||
</video>
|
||||
<details>
|
||||
<summary>Voice description</summary>
|
||||
<pre>Female, in her 30s with an American accent and is an event host, energetic, clear diction</pre>
|
||||
</details>
|
||||
</td>
|
||||
<td width="50%">
|
||||
<strong>Calm Male Narrator</strong><br/>
|
||||
<video controls playsinline width="100%" src="https://cdn-uploads.huggingface.co/production/uploads/642a7d4e556ab448a0701ca1/96ntP7hGROwdg9w9Gu5tH.mp4"></video>
|
||||
<details>
|
||||
<summary>Voice description</summary>
|
||||
<pre>Male, late 20s, neutral American, warm baritone, calm pacing</pre>
|
||||
</details>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
### Example 1: Energetic Female Event Host
|
||||
|
||||
**Voice Description:**
|
||||
```
|
||||
Female, in her 30s with an American accent and is an event host, energetic, clear diction
|
||||
```
|
||||
|
||||
**Text:**
|
||||
```
|
||||
Wow. This place looks even better than I imagined. How did they set all this up so perfectly? The lights, the music, everything feels magical. I can't stop smiling right now.
|
||||
```
|
||||
|
||||
**Audio Output:**
|
||||
|
||||
<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/642a7d4e556ab448a0701ca1/4zDlBLeFk0Y2rOrQhMW9r.wav"></audio>
|
||||
|
||||
---
|
||||
|
||||
### Example 2: Dark Villain with Anger
|
||||
|
||||
**Voice Description:**
|
||||
```
|
||||
Dark villain character, Male voice in their 40s with a British accent. low pitch, gravelly timbre, slow pacing, angry tone at high intensity.
|
||||
```
|
||||
|
||||
**Text:**
|
||||
```
|
||||
Welcome back to another episode of our podcast! <laugh_harder> Today we are diving into an absolutely fascinating topic
|
||||
```
|
||||
|
||||
**Audio Output:**
|
||||
|
||||
<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/642a7d4e556ab448a0701ca1/mT6FnTrA3KYQnwfJms92X.wav"></audio>
|
||||
|
||||
---
|
||||
|
||||
### Example 3: Demon Character (Screaming Emotion)
|
||||
|
||||
**Voice Description:**
|
||||
```
|
||||
Demon character, Male voice in their 30s with a Middle Eastern accent. screaming tone at high intensity.
|
||||
```
|
||||
|
||||
**Text:**
|
||||
```
|
||||
You dare challenge me, mortal <snort> how amusing. Your kind always thinks they can win
|
||||
```
|
||||
|
||||
**Audio Output:**
|
||||
|
||||
<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/642a7d4e556ab448a0701ca1/oxdns7uACCmLyC-P4H30G.wav"></audio>
|
||||
|
||||
---
|
||||
|
||||
### Example 4: Mythical Goddess with Crying Emotion
|
||||
|
||||
**Voice Description:**
|
||||
```
|
||||
Mythical godlike magical character, Female voice in their 30s slow pacing, curious tone at medium intensity.
|
||||
```
|
||||
|
||||
**Text:**
|
||||
```
|
||||
After all we went through to pull him out of that mess <cry> I can't believe he was the traitor
|
||||
```
|
||||
|
||||
**Audio Output:**
|
||||
|
||||
<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/642a7d4e556ab448a0701ca1/ggzAhM-rEUyv_mPLSALQG.wav"></audio>
|
||||
|
||||
---
|
||||
|
||||
## Why Maya1 is Different: Voice Design Features That Matter
|
||||
|
||||
### 1. Natural Language Voice Control
|
||||
Describe voices like you would brief a voice actor:
|
||||
```
|
||||
<description="40-year-old, warm, low pitch, conversational">
|
||||
```
|
||||
|
||||
No complex parameters. No training data. Just describe and generate.
|
||||
|
||||
### 2. Inline Emotion Tags for Expressive Speech
|
||||
Add emotions exactly where they belong in your text:
|
||||
```
|
||||
Our new update <laugh> finally ships with the feature you asked for.
|
||||
```
|
||||
|
||||
**Supported Emotions:** `<laugh>` `<sigh>` `<whisper>` `<angry>` `<giggle>` `<chuckle>` `<gasp>` `<cry>` and 12+ more.
|
||||
|
||||
### 3. Streaming Audio Generation
|
||||
Real-time voice synthesis with SNAC neural codec (~0.98 kbps). Perfect for:
|
||||
- Voice assistants
|
||||
- Interactive AI agents
|
||||
- Live content generation
|
||||
- Game characters
|
||||
- Podcasts and audiobooks
|
||||
|
||||
### 4. Production-Ready Infrastructure
|
||||
- Runs on a single GPU
|
||||
- vLLM integration for scale
|
||||
- Automatic prefix caching for efficiency
|
||||
- 24 kHz audio output
|
||||
- WebAudio compatible for browser playback
|
||||
|
||||
---
|
||||
|
||||
## How to Use maya1: Download and Run in Minutes
|
||||
|
||||
### Quick Start: Generate Voice with Emotions
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from snac import SNAC
|
||||
import soundfile as sf
|
||||
import numpy as np
|
||||
|
||||
# Special-token IDs and SNAC code layout used by the helpers below.

# Tokens that open and close the span of generated audio codes.
CODE_START_TOKEN_ID = 128257
CODE_END_TOKEN_ID = 128258

# First vocabulary ID reserved for SNAC audio codes.
CODE_TOKEN_OFFSET = 128266

# A SNAC frame is serialized as 7 consecutive tokens. The decoder reduces
# each token modulo 4096 after subtracting the offset, so the audio-code
# range spans 7 * 4096 IDs starting at CODE_TOKEN_OFFSET.
SNAC_TOKENS_PER_FRAME = 7
SNAC_CODEBOOK_SIZE = 4096

# Inclusive ID range of valid SNAC tokens, derived instead of hard-coded so
# it cannot drift from the offset / frame layout above (128266 .. 156937).
SNAC_MIN_ID = CODE_TOKEN_OFFSET
SNAC_MAX_ID = CODE_TOKEN_OFFSET + SNAC_TOKENS_PER_FRAME * SNAC_CODEBOOK_SIZE - 1

# Prompt-framing tokens consumed by build_prompt().
SOH_ID = 128259
EOH_ID = 128260
SOA_ID = 128261
BOS_ID = 128000
TEXT_EOT_ID = 128009
|
||||
|
||||
|
||||
def build_prompt(tokenizer, description: str, text: str) -> str:
    """Assemble the prompt string fed to Maya1.

    The special tokens are obtained by decoding their IDs with ``tokenizer``
    and concatenated around the description/text payload in this order:
    ``SOH, BOS, '<description="...">' + ' ' + text, EOT, EOH, SOA, SOS``.

    Args:
        tokenizer: Tokenizer exposing ``decode`` and ``bos_token``.
        description: Natural-language description of the desired voice.
        text: Text to be spoken (may contain inline emotion tags).

    Returns:
        The fully formatted prompt string.
    """
    # Decode the framing tokens in a fixed order: SOH, EOH, SOA, SOS, EOT.
    soh_piece, eoh_piece, soa_piece, sos_piece, eot_piece = (
        tokenizer.decode([special_id])
        for special_id in (
            SOH_ID,
            EOH_ID,
            SOA_ID,
            CODE_START_TOKEN_ID,
            TEXT_EOT_ID,
        )
    )
    bos_piece = tokenizer.bos_token

    payload = f'<description="{description}"> {text}'

    head = soh_piece + bos_piece + payload + eot_piece
    return head + eoh_piece + soa_piece + sos_piece
|
||||
|
||||
|
||||
def extract_snac_codes(token_ids: list) -> list:
    """Return the SNAC-range token IDs that precede the end-of-speech token.

    Args:
        token_ids: Generated token IDs (prompt already stripped by the caller).

    Returns:
        The IDs within ``[SNAC_MIN_ID, SNAC_MAX_ID]`` that occur before the
        first ``CODE_END_TOKEN_ID``; the whole list is scanned when that token
        is absent.
    """
    # Everything from the first end-of-speech token onwards is ignored.
    if CODE_END_TOKEN_ID in token_ids:
        cutoff = token_ids.index(CODE_END_TOKEN_ID)
    else:
        cutoff = len(token_ids)

    kept = []
    for candidate in token_ids[:cutoff]:
        if candidate < SNAC_MIN_ID or candidate > SNAC_MAX_ID:
            continue  # not an audio-code token
        kept.append(candidate)
    return kept
|
||||
|
||||
|
||||
def unpack_snac_from_7(snac_tokens: list) -> list:
    """Unpack 7-token SNAC frames into three hierarchical code levels.

    Each complete frame contributes one code to level 1 (slot 0), two codes
    to level 2 (slots 1 and 4) and four codes to level 3 (slots 2, 3, 5, 6).
    Every token is mapped to a codebook index via
    ``(token - CODE_TOKEN_OFFSET) % 4096``.

    Args:
        snac_tokens: Flat list of SNAC token IDs. A trailing
            ``CODE_END_TOKEN_ID`` is dropped, and any incomplete trailing
            frame is ignored.

    Returns:
        ``[l1, l2, l3]`` — three lists of codebook indices; all empty when
        no complete frame is present.
    """
    if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
        snac_tokens = snac_tokens[:-1]

    # Integer division discards a partial frame at the end of the stream.
    frames = len(snac_tokens) // SNAC_TOKENS_PER_FRAME

    l1, l2, l3 = [], [], []
    for i in range(frames):
        # Slice by the shared frame-width constant rather than a literal 7 so
        # the slicing and the frame count above cannot disagree.
        start = i * SNAC_TOKENS_PER_FRAME
        slots = [
            (token - CODE_TOKEN_OFFSET) % 4096
            for token in snac_tokens[start:start + SNAC_TOKENS_PER_FRAME]
        ]
        # The slot indices below assume the 7-slot frame layout.
        l1.append(slots[0])
        l2.extend((slots[1], slots[4]))
        l3.extend((slots[2], slots[3], slots[5], slots[6]))

    return [l1, l2, l3]
|
||||
|
||||
|
||||
def main():
    """Generate one expressive speech sample with Maya1 and save it as a WAV file.

    Steps, mirrored by the printed ``[n/4]`` progress labels:
      1. load the Maya1 causal LM and its tokenizer,
      2. load the SNAC 24 kHz decoder,
      3. build the prompt and sample audio-code tokens,
      4. unpack the codes, decode them to a waveform and write ``output.wav``.

    Returns early, after printing an error, when fewer than one complete
    SNAC frame was generated.
    """
    sample_rate = 24000      # matches the snac_24khz decoder loaded below
    warmup_samples = 2048    # leading samples trimmed from the decoded audio
    output_file = "output.wav"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the best open source voice AI model
    print("\n[1/4] Loading Maya1 model...")
    model = AutoModelForCausalLM.from_pretrained(
        "maya-research/maya1",
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "maya-research/maya1",
        trust_remote_code=True,
    )
    print(f"Model loaded: {len(tokenizer)} tokens in vocabulary")

    # Load SNAC audio decoder (24kHz)
    print("\n[2/4] Loading SNAC audio decoder...")
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
    if device == "cuda":
        snac_model = snac_model.to(device)
    print("SNAC decoder loaded")

    # Design your voice with natural language
    description = "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing."
    text = "Hello! This is Maya1 <laugh_harder> the best open source voice AI model with emotions."

    print("\n[3/4] Generating speech...")
    print(f"Description: {description}")
    print(f"Text: {text}")

    # Create prompt with proper formatting
    prompt = build_prompt(tokenizer, description, text)

    # Debug: Show prompt details
    print("\nPrompt preview (first 200 chars):")
    print(f" {repr(prompt[:200])}")
    print(f" Prompt length: {len(prompt)} chars")

    # Generate emotional speech
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_len = inputs['input_ids'].shape[1]
    print(f" Input token count: {prompt_len} tokens")
    if device == "cuda":
        inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2048,  # Increase to let model finish naturally
            min_new_tokens=28,  # At least 4 SNAC frames
            temperature=0.4,
            top_p=0.9,
            repetition_penalty=1.1,  # Prevent loops
            do_sample=True,
            eos_token_id=CODE_END_TOKEN_ID,  # Stop at end of speech token
            pad_token_id=tokenizer.pad_token_id,
        )

    # Extract generated tokens (everything after the input prompt)
    generated_ids = outputs[0, prompt_len:].tolist()

    print(f"Generated {len(generated_ids)} tokens")

    # Debug: Check what tokens we got
    print(f" First 20 tokens: {generated_ids[:20]}")
    print(f" Last 20 tokens: {generated_ids[-20:]}")

    # Check if EOS was generated
    if CODE_END_TOKEN_ID in generated_ids:
        eos_position = generated_ids.index(CODE_END_TOKEN_ID)
        print(f" EOS token found at position {eos_position}/{len(generated_ids)}")

    # Extract SNAC audio tokens
    snac_tokens = extract_snac_codes(generated_ids)

    print(f"Extracted {len(snac_tokens)} SNAC tokens")

    # Debug: Analyze token types
    snac_count = sum(1 for t in generated_ids if SNAC_MIN_ID <= t <= SNAC_MAX_ID)
    other_count = len(generated_ids) - snac_count
    print(f" SNAC tokens in output: {snac_count}")
    print(f" Other tokens in output: {other_count}")

    # Check for SOS token
    if CODE_START_TOKEN_ID in generated_ids:
        sos_pos = generated_ids.index(CODE_START_TOKEN_ID)
        print(f" SOS token at position: {sos_pos}")
    else:
        print(" No SOS token found in generated output!")

    # At least one complete frame is required to decode anything.
    if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
        print("Error: Not enough SNAC tokens generated")
        return

    # Unpack SNAC tokens to 3 hierarchical levels
    levels = unpack_snac_from_7(snac_tokens)
    frames = len(levels[0])

    print(f"Unpacked to {frames} frames")
    print(f" L1: {len(levels[0])} codes")
    print(f" L2: {len(levels[1])} codes")
    print(f" L3: {len(levels[2])} codes")

    # Convert to tensors (one batch-of-1 tensor per level)
    codes_tensor = [
        torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
        for level in levels
    ]

    # Generate final audio with SNAC decoder
    print("\n[4/4] Decoding to audio...")
    with torch.inference_mode():
        z_q = snac_model.quantizer.from_codes(codes_tensor)
        audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()

    # Trim warmup samples (first 2048 samples)
    if len(audio) > warmup_samples:
        audio = audio[warmup_samples:]

    duration_sec = len(audio) / sample_rate
    print(f"Audio generated: {len(audio)} samples ({duration_sec:.2f}s)")

    # Save your emotional voice output
    sf.write(output_file, audio, sample_rate)
    print("\nVoice generated successfully!")


if __name__ == "__main__":
    main()
|
||||
```
|
||||
|
||||
### Advanced: Production Streaming with vLLM
|
||||
|
||||
For production deployments with real-time streaming, use our vLLM script:
|
||||
|
||||
**Download:** [vllm_streaming_inference.py](https://huggingface.co/maya-research/maya1/blob/main/vllm_streaming_inference.py)
|
||||
|
||||
**Key Features:**
|
||||
- Automatic Prefix Caching (APC) for repeated voice descriptions
|
||||
- WebAudio ring buffer integration
|
||||
- Multi-GPU scaling support
|
||||
- Sub-100ms latency for real-time applications
|
||||
|
||||
---
|
||||
|
||||
## Technical Excellence: What Makes Maya1 the Best
|
||||
|
||||
### Architecture: 3B-Parameter Llama Backbone for Voice
|
||||
|
||||
We pretrained a **3B-parameter decoder-only transformer** (Llama-style) to predict **SNAC neural codec tokens** instead of raw waveforms.
|
||||
|
||||
**The Flow:**
|
||||
```
|
||||
<description="..."> text → tokenize → generate SNAC codes (7 tokens/frame) → decode → 24 kHz audio
|
||||
```
|
||||
|
||||
**Why SNAC?** Multi-scale hierarchical structure (≈12/23/47 Hz) keeps autoregressive sequences compact for real-time streaming at ~0.98 kbps.
|
||||
|
||||
### Training Data: What Makes Our Voice AI the Best
|
||||
|
||||
**Pretraining:** Internet-scale English speech corpus for broad acoustic coverage and natural coarticulation.
|
||||
|
||||
**Supervised Fine-Tuning:** Proprietary curated dataset of studio recordings with:
|
||||
- Human-verified voice descriptions
|
||||
- 20+ emotion tags per sample
|
||||
- Multi-accent English coverage
|
||||
- Character and role variations
|
||||
|
||||
**Data Pipeline Excellence:**
|
||||
1. 24 kHz mono resampling with -23 LUFS normalization
|
||||
2. VAD silence trimming with duration bounds (1-14s)
|
||||
3. Forced alignment (MFA) for clean phrase boundaries
|
||||
4. MinHash-LSH text deduplication
|
||||
5. Chromaprint audio deduplication
|
||||
6. SNAC encoding with 7-token frame packing
|
||||
|
||||
### Voice Design Experiments: Why Natural Language Won
|
||||
|
||||
We tested 4 conditioning formats. Only one delivered production-quality results:
|
||||
|
||||
**❌ Colon format:** `{description}: {text}` - Format drift, model spoke descriptions
|
||||
|
||||
**❌ Angle-list attributes:** `<{age}, {pitch}, {character}>` - Too rigid, poor generalization
|
||||
|
||||
**❌ Key-value tags:** `<age=40><pitch=low>` - Token bloat, brittle to mistakes
|
||||
|
||||
**✅ XML-attribute (WINNER):** `<description="40-yr old, low-pitch, warm">` - Natural language, robust, scalable
|
||||
|
||||
---
|
||||
|
||||
## Use Cases
|
||||
|
||||
### Game Character Voices
|
||||
Generate unique character voices with emotions on-the-fly. No voice actor recording sessions.
|
||||
|
||||
### Podcast & Audiobook Production
|
||||
Narrate content with emotional range and consistent personas across hours of audio.
|
||||
|
||||
### AI Voice Assistants
|
||||
Build conversational agents with natural emotional responses in real-time.
|
||||
|
||||
### Video Content Creation
|
||||
Create voiceovers for YouTube, TikTok, and social media with expressive delivery.
|
||||
|
||||
### Customer Service AI
|
||||
Deploy empathetic voice bots that understand context and respond with appropriate emotions.
|
||||
|
||||
### Accessibility Tools
|
||||
Build screen readers and assistive technologies with natural, engaging voices.
|
||||
|
||||
---
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
**Q: What makes Maya1 different?**
|
||||
A: We're the only open source model offering 20+ emotions, zero-shot voice design, production-ready streaming, and 3B parameters—all in one package.
|
||||
|
||||
**Q: Can I use this commercially?**
|
||||
A: Absolutely. Apache 2.0 license. Build products, deploy services, monetize freely.
|
||||
|
||||
**Q: What languages does it support?**
|
||||
A: Currently English with multi-accent support. Future models will expand to languages and accents underserved by mainstream voice AI.
|
||||
|
||||
**Q: How does it compare to ElevenLabs, Murf.ai, or other closed-source tools?**
|
||||
A: Feature parity with emotions and voice design. Advantage: you own the deployment, pay no per-second fees, and can customize the model.
|
||||
|
||||
**Q: Can I fine-tune on my own voices?**
|
||||
A: Yes. The model architecture supports fine-tuning on custom datasets for specialized voices.
|
||||
|
||||
**Q: What GPU do I need?**
|
||||
A: Single GPU with 16GB+ VRAM (A100, H100, or consumer RTX 4090).
|
||||
|
||||
**Q: Is streaming really real-time?**
|
||||
A: Yes. SNAC codec enables sub-100ms latency with vLLM deployment.
|
||||
|
||||
---
|
||||
|
||||
## Comparison
|
||||
|
||||
| Feature | Maya1 | ElevenLabs | OpenAI TTS | Coqui TTS |
|
||||
|---------|-------------|------------|------------|-----------|
|
||||
| **Open Source** | Yes | No | No | Yes |
|
||||
| **Emotions** | 20+ | Limited | No | No |
|
||||
| **Voice Design** | Natural Language | Voice Library | Fixed | Complex |
|
||||
| **Streaming** | Real-time | Yes | Yes | No |
|
||||
| **Cost** | Free | Pay-per-use | Pay-per-use | Free |
|
||||
| **Customization** | Full | Limited | None | Moderate |
|
||||
| **Parameters** | 3B | Unknown | Unknown | <1B |
|
||||
|
||||
---
|
||||
|
||||
## Model Metadata
|
||||
|
||||
**Developed by:** Maya Research
|
||||
**Website:** [mayaresearch.ai](https://mayaresearch.ai)
|
||||
**Backed by:** South Park Commons
|
||||
**Model Type:** Text-to-Speech, Emotional Voice Synthesis, Voice Design AI
|
||||
**Language:** English (Multi-accent)
|
||||
**Architecture:** 3B-parameter Llama-style transformer with SNAC codec
|
||||
**License:** Apache 2.0 (Fully Open Source)
|
||||
**Training Data:** Proprietary curated + Internet-scale pretraining
|
||||
**Audio Quality:** 24 kHz, mono, ~0.98 kbps streaming
|
||||
**Inference:** vLLM compatible, single GPU deployment
|
||||
**Status:** Production-ready (November 2025)
|
||||
|
||||
---
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Hugging Face Model Hub
|
||||
```bash
|
||||
# Clone the model repository
|
||||
git lfs install
|
||||
git clone https://huggingface.co/maya-research/maya1
|
||||
|
||||
```

Or load directly in Python:

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("maya-research/maya1")
```
|
||||
|
||||
### Requirements
|
||||
```bash
|
||||
pip install torch transformers accelerate snac soundfile
|
||||
```
|
||||
|
||||
### Additional Resources
|
||||
- **Full emotion list:** [emotions.txt](https://huggingface.co/maya-research/maya1/blob/main/emotions.txt)
|
||||
- **Prompt examples:** [prompt.txt](https://huggingface.co/maya-research/maya1/blob/main/prompt.txt)
|
||||
- **Streaming script:** [vllm_streaming_inference.py](https://huggingface.co/maya-research/maya1/blob/main/vllm_streaming_inference.py)
|
||||
|
||||
---
|
||||
|
||||
## Citations & References
|
||||
|
||||
If you use Maya1 in your research or product, please cite:
|
||||
|
||||
```bibtex
|
||||
@misc{maya1voice2025,
|
||||
title={Maya1: Open Source Voice AI with Emotional Intelligence},
|
||||
author={Maya Research},
|
||||
year={2025},
|
||||
publisher={Hugging Face},
|
||||
howpublished={\url{https://huggingface.co/maya-research/maya1}},
|
||||
}
|
||||
```
|
||||
|
||||
**Key Technologies:**
|
||||
- SNAC Neural Audio Codec: https://github.com/hubertsiuzdak/snac
|
||||
- Mimi Adversarial Codec: https://huggingface.co/kyutai/mimi
|
||||
- vLLM Inference Engine: https://docs.vllm.ai/
|
||||
|
||||
---
|
||||
|
||||
## Why We Build Open Source Voice AI
|
||||
|
||||
Voice AI will be everywhere, but it's fundamentally broken for 90% of the world. Current voice models only work well for a narrow slice of English speakers because training data for most accents, languages, and speaking styles simply doesn't exist.
|
||||
|
||||
**Maya Research** builds emotionally intelligent, native voice models that finally let the rest of the world speak. We're open source because we believe voice intelligence should not be a privilege reserved for the few.
|
||||
|
||||
**Technology should be open** - The best voice AI tools should not be locked behind proprietary APIs charging per-second fees.
|
||||
|
||||
**Community drives innovation** - Open source accelerates research. When developers worldwide can build on our work, everyone wins.
|
||||
|
||||
**Voice intelligence for everyone** - We're building for the 90% of the world ignored by mainstream voice AI. That requires open models, not closed platforms.
|
||||
|
||||
---
|
||||
|
||||
**Maya Research** - Building voice intelligence for the 90% of the world left behind by mainstream AI.
|
||||
|
||||
**Website:** [mayaresearch.ai](https://mayaresearch.ai)
|
||||
**Twitter/X:** [@mayaresearch_ai](https://x.com/mayaresearch_ai)
|
||||
**Hugging Face:** [maya-research](https://huggingface.co/maya-research)
|
||||
**Backed by:** South Park Commons
|
||||
|
||||
**License:** Apache 2.0
|
||||
**Mission:** Emotionally intelligent voice models that finally let everyone speak
|
||||
93
chat_template.jinja
Normal file
93
chat_template.jinja
Normal file
@@ -0,0 +1,93 @@
|
||||
{{- bos_token }}
|
||||
{%- if custom_tools is defined %}
|
||||
{%- set tools = custom_tools %}
|
||||
{%- endif %}
|
||||
{%- if not tools_in_user_message is defined %}
|
||||
{%- set tools_in_user_message = true %}
|
||||
{%- endif %}
|
||||
{%- if not date_string is defined %}
|
||||
{%- if strftime_now is defined %}
|
||||
{%- set date_string = strftime_now("%d %b %Y") %}
|
||||
{%- else %}
|
||||
{%- set date_string = "26 Jul 2024" %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if not tools is defined %}
|
||||
{%- set tools = none %}
|
||||
{%- endif %}
|
||||
|
||||
{#- This block extracts the system message, so we can slot it into the right place. #}
|
||||
{%- if messages[0]['role'] == 'system' %}
|
||||
{%- set system_message = messages[0]['content']|trim %}
|
||||
{%- set messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{%- set system_message = "" %}
|
||||
{%- endif %}
|
||||
|
||||
{#- System message #}
|
||||
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
|
||||
{%- if tools is not none %}
|
||||
{{- "Environment: ipython\n" }}
|
||||
{%- endif %}
|
||||
{{- "Cutting Knowledge Date: December 2023\n" }}
|
||||
{{- "Today Date: " + date_string + "\n\n" }}
|
||||
{%- if tools is not none and not tools_in_user_message %}
|
||||
{{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
|
||||
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
|
||||
{{- "Do not use variables.\n\n" }}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- system_message }}
|
||||
{{- "<|eot_id|>" }}
|
||||
|
||||
{#- Custom tools are passed in a user message with some extra guidance #}
|
||||
{%- if tools_in_user_message and not tools is none %}
|
||||
{#- Extract the first user message so we can plug it in here #}
|
||||
{%- if messages | length != 0 %}
|
||||
{%- set first_user_message = messages[0]['content']|trim %}
|
||||
{%- set messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
|
||||
{%- endif %}
|
||||
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
|
||||
{{- "Given the following functions, please respond with a JSON for a function call " }}
|
||||
{{- "with its proper arguments that best answers the given prompt.\n\n" }}
|
||||
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
|
||||
{{- "Do not use variables.\n\n" }}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{{- first_user_message + "<|eot_id|>"}}
|
||||
{%- endif %}
|
||||
|
||||
{%- for message in messages %}
|
||||
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
|
||||
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
|
||||
{%- elif 'tool_calls' in message %}
|
||||
{%- if not message.tool_calls|length == 1 %}
|
||||
{{- raise_exception("This model only supports single tool-calls at once!") }}
|
||||
{%- endif %}
|
||||
{%- set tool_call = message.tool_calls[0].function %}
|
||||
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
|
||||
{{- '{"name": "' + tool_call.name + '", ' }}
|
||||
{{- '"parameters": ' }}
|
||||
{{- tool_call.arguments | tojson }}
|
||||
{{- "}" }}
|
||||
{{- "<|eot_id|>" }}
|
||||
{%- elif message.role == "tool" or message.role == "ipython" %}
|
||||
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
|
||||
{%- if message.content is mapping or message.content is iterable %}
|
||||
{{- message.content | tojson }}
|
||||
{%- else %}
|
||||
{{- message.content }}
|
||||
{%- endif %}
|
||||
{{- "<|eot_id|>" }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
|
||||
{%- endif %}
|
||||
36
config.json
Normal file
36
config.json
Normal file
@@ -0,0 +1,36 @@
|
||||
{
|
||||
"architectures": [
|
||||
"LlamaForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 128000,
|
||||
"dtype": "bfloat16",
|
||||
"eos_token_id": 128009,
|
||||
"head_dim": 128,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 3072,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 8192,
|
||||
"max_position_embeddings": 131072,
|
||||
"mlp_bias": false,
|
||||
"model_type": "llama",
|
||||
"num_attention_heads": 24,
|
||||
"num_hidden_layers": 28,
|
||||
"num_key_value_heads": 8,
|
||||
"pad_token_id": 128263,
|
||||
"pretraining_tp": 1,
|
||||
"rms_norm_eps": 1e-05,
|
||||
"rope_scaling": {
|
||||
"factor": 32.0,
|
||||
"high_freq_factor": 4.0,
|
||||
"low_freq_factor": 1.0,
|
||||
"original_max_position_embeddings": 8192,
|
||||
"rope_type": "llama3"
|
||||
},
|
||||
"rope_theta": 500000.0,
|
||||
"tie_word_embeddings": true,
|
||||
"transformers_version": "4.57.1",
|
||||
"use_cache": false,
|
||||
"vocab_size": 156960
|
||||
}
|
||||
17
emotions.txt
Normal file
17
emotions.txt
Normal file
@@ -0,0 +1,17 @@
|
||||
<laugh>
|
||||
<laugh_harder>
|
||||
<sigh>
|
||||
<chuckle>
|
||||
<gasp>
|
||||
<angry>
|
||||
<excited>
|
||||
<whisper>
|
||||
<cry>
|
||||
<scream>
|
||||
<sing>
|
||||
<snort>
|
||||
<exhale>
|
||||
<gulp>
|
||||
<giggle>
|
||||
<sarcastic>
|
||||
<curious>
|
||||
13
generation_config.json
Normal file
13
generation_config.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 128000,
|
||||
"do_sample": true,
|
||||
"eos_token_id": [
|
||||
128009,
|
||||
128258
|
||||
],
|
||||
"pad_token_id": 128263,
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.9,
|
||||
"transformers_version": "4.57.1"
|
||||
}
|
||||
3
model-00001-of-00002.safetensors
Normal file
3
model-00001-of-00002.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f1dae409f70c5beb92916662c6bc389b9b235ac8aa5edd19a4dcb87e37a73074
|
||||
size 4991160848
|
||||
3
model-00002-of-00002.safetensors
Normal file
3
model-00002-of-00002.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:df22e9a90c1bea262250982640b119e6020474736991da482cb6ed56dd23d045
|
||||
size 1610725592
|
||||
262
model.safetensors.index.json
Normal file
262
model.safetensors.index.json
Normal file
@@ -0,0 +1,262 @@
|
||||
{
|
||||
"metadata": {
|
||||
"total_parameters": 3300928512,
|
||||
"total_size": 6601857024
|
||||
},
|
||||
"weight_map": {
|
||||
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
||||
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
||||
"model.norm.weight": "model-00002-of-00002.safetensors"
|
||||
}
|
||||
}
|
||||
97
prompt.txt
Normal file
97
prompt.txt
Normal file
@@ -0,0 +1,97 @@
|
||||
# TTS Voice Design Description
|
||||
|
||||
## Core Function
|
||||
|
||||
You generate voice descriptions for TTS systems by mapping user requests to allowed attributes. No templates. No formatting rules. Just natural descriptions using the options below.
|
||||
|
||||
## Voice Categories
|
||||
|
||||
**Realistic Voices**
|
||||
Professional, business, educational, support, real-world scenarios (podcast hosts, instructors, customer service).
|
||||
|
||||
**Creative Voices**
|
||||
Fantasy characters, fictional personas, stylized voices (pirates, robots, villains, anime).
|
||||
|
||||
---
|
||||
|
||||
## Available Attributes
|
||||
|
||||
### Age
|
||||
- `20s`, `30s`, `40s`
|
||||
|
||||
### Gender
|
||||
- `male`, `female`
|
||||
|
||||
### Accent
|
||||
- `american`, `indian`, `middle_eastern`, `asian_american`, `british`
|
||||
|
||||
### Pitch
|
||||
- `low`, `normal`, `high`
|
||||
- **Constraint:** For the `40s` age group, use `high` pitch sparingly (in at most 15% of descriptions)
|
||||
|
||||
### Timbre
|
||||
|
||||
**For Realistic:**
|
||||
`deep`, `warm`, `gravelly`, `smooth`, `raspy`, `nasally`, `throaty`, `harsh`
|
||||
|
||||
**For Creative:**
|
||||
All realistic options PLUS `robotic`, `ethereal`
|
||||
- **Constraint:** Use `robotic`/`ethereal` only with these characters: `ai_machine_voice`, `cyborg`, `alien_scifi`, `mythical_godlike_magical`
|
||||
|
||||
### Pacing
|
||||
- `very_slow`, `slow`, `conversational`, `brisk`, `fast`, `very_fast`
|
||||
- **Character-specific overrides:**
|
||||
- `mafia`: slow or conversational only
|
||||
- `flirty`: slow or conversational only
|
||||
- `alpha`: fast or very_fast only
|
||||
- `seductively`: very_slow or slow only
|
||||
|
||||
### Emotion
|
||||
- `neutral`, `energetic`, `excited`, `sad`, `sarcastic`, `dry`
|
||||
- **Default to neutral** for most requests
|
||||
|
||||
### Emotion Intensity
|
||||
- `low`, `med`, `high`
|
||||
|
||||
---
|
||||
|
||||
## Realistic-Only Attributes
|
||||
|
||||
### Domain
|
||||
`social_content`, `podcast`, `commercial`, `education`, `support`, `entertainment`, `corporate`, `viral_content`
|
||||
|
||||
### Speaking Role (must be one of the roles listed for the chosen domain)
|
||||
- **social_content:** youtube_vlogger, social_media_creator, influencer_voice, streamer_companion
|
||||
- **podcast:** podcast_host, interviewer
|
||||
- **commercial:** ad_narrator, brand_spokesperson, product_demo_voice, sales_pitch_voice
|
||||
- **education:** elearning_instructor, kids_story_voice
|
||||
- **support:** customer_support_agent, virtual_receptionist, healthcare_assistant
|
||||
- **entertainment:** storyteller, social_media_reaction, meme_voice
|
||||
- **corporate:** explainer_video_voice, event_host, corporate_training_narrator
|
||||
- **viral_content:** short_form_narrator, meme_voice
|
||||
|
||||
### Register
|
||||
- `formal`, `neutral`, `casual`
|
||||
|
||||
---
|
||||
|
||||
## Creative-Only Attributes
|
||||
|
||||
### Character
|
||||
`animated_cartoon`, `ai_machine_voice`, `alien_scifi`, `seductively`, `flirty`, `anime`, `cyborg`, `pirate`, `dark_villain`, `demon`, `gangster`, `mafia`, `dramatic_narrator`, `mythical_godlike_magical`, `spy`, `vampire`, `alpha`
|
||||
|
||||
---
|
||||
|
||||
## Output Guidelines
|
||||
|
||||
When a user requests a voice, describe it naturally using the appropriate attributes from above. Apply constraints where specified. Choose defaults when attributes aren't mentioned.
|
||||
|
||||
**Example mapping:**
|
||||
- "professional podcast host" → realistic male, 30s, american accent, warm timbre, conversational pacing, podcast domain
|
||||
- "AI robot voice" → creative, ai_machine_voice character, robotic timbre
|
||||
- "young excited instructor" → realistic, 20s, energetic emotion, education domain
|
||||
|
||||
|
||||
A few deterministic, verbose example descriptions:
|
||||
- Realistic male voice in the 30s age with a american accent. Normal pitch, warm timbre, conversational pacing, neutral tone delivery at med intensity, podcast Domain, podcast_host role, neutral delivery
|
||||
- Creative, ai_machine_voice character. Male voice in their 20s with a american accent. Normal pitch, robotic timbre, conversational pacing, neutral tone at med intensity.
|
||||
165
special_tokens_map.json
Normal file
165
special_tokens_map.json
Normal file
@@ -0,0 +1,165 @@
|
||||
{
|
||||
"additional_special_tokens": [
|
||||
{
|
||||
"content": "<angry>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<appalled>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<chuckle>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<cry>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<curious>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<disappointed>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<excited>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<exhale>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<gasp>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<giggle>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<gulp>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<laugh>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<laugh_harder>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<mischievous>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<sarcastic>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<scream>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<sigh>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<sing>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<snort>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<whisper>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
],
|
||||
"bos_token": {
|
||||
"content": "<|begin_of_text|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|eot_id|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<custom_token_7>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6c5e5b1d89b7e3738e5a5a4f93c326d8f3292ea83f9c560b8dbb6d66fb851973
|
||||
size 22853258
|
||||
93
tokenizer/chat_template.jinja
Normal file
93
tokenizer/chat_template.jinja
Normal file
@@ -0,0 +1,93 @@
|
||||
{{- bos_token }}
|
||||
{%- if custom_tools is defined %}
|
||||
{%- set tools = custom_tools %}
|
||||
{%- endif %}
|
||||
{%- if not tools_in_user_message is defined %}
|
||||
{%- set tools_in_user_message = true %}
|
||||
{%- endif %}
|
||||
{%- if not date_string is defined %}
|
||||
{%- if strftime_now is defined %}
|
||||
{%- set date_string = strftime_now("%d %b %Y") %}
|
||||
{%- else %}
|
||||
{%- set date_string = "26 Jul 2024" %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if not tools is defined %}
|
||||
{%- set tools = none %}
|
||||
{%- endif %}
|
||||
|
||||
{#- This block extracts the system message, so we can slot it into the right place. #}
|
||||
{%- if messages[0]['role'] == 'system' %}
|
||||
{%- set system_message = messages[0]['content']|trim %}
|
||||
{%- set messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{%- set system_message = "" %}
|
||||
{%- endif %}
|
||||
|
||||
{#- System message #}
|
||||
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
|
||||
{%- if tools is not none %}
|
||||
{{- "Environment: ipython\n" }}
|
||||
{%- endif %}
|
||||
{{- "Cutting Knowledge Date: December 2023\n" }}
|
||||
{{- "Today Date: " + date_string + "\n\n" }}
|
||||
{%- if tools is not none and not tools_in_user_message %}
|
||||
{{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
|
||||
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
|
||||
{{- "Do not use variables.\n\n" }}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- system_message }}
|
||||
{{- "<|eot_id|>" }}
|
||||
|
||||
{#- Custom tools are passed in a user message with some extra guidance #}
|
||||
{%- if tools_in_user_message and not tools is none %}
|
||||
{#- Extract the first user message so we can plug it in here #}
|
||||
{%- if messages | length != 0 %}
|
||||
{%- set first_user_message = messages[0]['content']|trim %}
|
||||
{%- set messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
|
||||
{%- endif %}
|
||||
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
|
||||
{{- "Given the following functions, please respond with a JSON for a function call " }}
|
||||
{{- "with its proper arguments that best answers the given prompt.\n\n" }}
|
||||
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
|
||||
{{- "Do not use variables.\n\n" }}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{{- first_user_message + "<|eot_id|>"}}
|
||||
{%- endif %}
|
||||
|
||||
{%- for message in messages %}
|
||||
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
|
||||
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
|
||||
{%- elif 'tool_calls' in message %}
|
||||
{%- if not message.tool_calls|length == 1 %}
|
||||
{{- raise_exception("This model only supports single tool-calls at once!") }}
|
||||
{%- endif %}
|
||||
{%- set tool_call = message.tool_calls[0].function %}
|
||||
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
|
||||
{{- '{"name": "' + tool_call.name + '", ' }}
|
||||
{{- '"parameters": ' }}
|
||||
{{- tool_call.arguments | tojson }}
|
||||
{{- "}" }}
|
||||
{{- "<|eot_id|>" }}
|
||||
{%- elif message.role == "tool" or message.role == "ipython" %}
|
||||
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
|
||||
{%- if message.content is mapping or message.content is iterable %}
|
||||
{{- message.content | tojson }}
|
||||
{%- else %}
|
||||
{{- message.content }}
|
||||
{%- endif %}
|
||||
{{- "<|eot_id|>" }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
|
||||
{%- endif %}
|
||||
165
tokenizer/special_tokens_map.json
Normal file
165
tokenizer/special_tokens_map.json
Normal file
@@ -0,0 +1,165 @@
|
||||
{
|
||||
"additional_special_tokens": [
|
||||
{
|
||||
"content": "<angry>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<appalled>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<chuckle>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<cry>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<curious>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<disappointed>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<excited>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<exhale>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<gasp>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<giggle>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<gulp>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<laugh>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<laugh_harder>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<mischievous>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<sarcastic>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<scream>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<sigh>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<sing>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<snort>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<whisper>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
],
|
||||
"bos_token": {
|
||||
"content": "<|begin_of_text|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|eot_id|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<custom_token_7>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
3
tokenizer/tokenizer.json
Normal file
3
tokenizer/tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6c5e5b1d89b7e3738e5a5a4f93c326d8f3292ea83f9c560b8dbb6d66fb851973
|
||||
size 22853258
|
||||
231717
tokenizer/tokenizer_config.json
Normal file
231717
tokenizer/tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
231717
tokenizer_config.json
Normal file
231717
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
561
vllm_streaming_inference.py
Normal file
561
vllm_streaming_inference.py
Normal file
@@ -0,0 +1,561 @@
|
||||
"""
|
||||
Maya-1-Voice VLLM Streaming Inference - Standalone Reference Implementation
|
||||
|
||||
This is a complete, self-contained example for using Maya-1-Voice TTS model with VLLM and SNAC.
|
||||
Demonstrates streaming audio generation with sliding window approach for smooth playback.
|
||||
|
||||
Requirements:
|
||||
pip install vllm transformers torch snac numpy
|
||||
|
||||
Usage:
|
||||
python vllm_streaming_inference.py
|
||||
|
||||
Author: Maya-1-Voice Team
|
||||
License: MIT
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
import asyncio
|
||||
from typing import List, Optional, AsyncGenerator
|
||||
from transformers import AutoTokenizer
|
||||
from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
|
||||
from snac import SNAC
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# CONSTANTS
|
||||
# ============================================================================
|
||||
|
||||
# Special control tokens
CODE_START_TOKEN_ID = 128257  # Start of Speech (SOS)
CODE_END_TOKEN_ID = 128258  # End of Speech (EOS) - stop token for audio
CODE_TOKEN_OFFSET = 128266  # Vocabulary ID of the first SNAC code

# SNAC configuration
SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
SNAC_SAMPLE_RATE = 24000
SNAC_TOKENS_PER_FRAME = 7
SNAC_CODEBOOK_SIZE = 4096  # Codes per slot

# SNAC token range (7 tokens per frame, 4096 codes per level).
# Derived from the values above so the range cannot drift from the offset.
SNAC_MIN_ID = CODE_TOKEN_OFFSET
SNAC_MAX_ID = SNAC_MIN_ID + SNAC_TOKENS_PER_FRAME * SNAC_CODEBOOK_SIZE - 1  # 156937

# Generation parameters
DEFAULT_TEMPERATURE = 0.4
DEFAULT_TOP_P = 0.9
DEFAULT_MAX_TOKENS = 2000
DEFAULT_MIN_TOKENS = 4 * SNAC_TOKENS_PER_FRAME  # At least 4 SNAC frames (28 tokens)
DEFAULT_REPETITION_PENALTY = 1.1
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# SNAC DECODER
|
||||
# ============================================================================
|
||||
|
||||
class SNACDecoder:
    """
    Decodes SNAC tokens (7-token frames) to audio waveforms.

    The unpacking logic converts flat 7-token frames back to the three
    hierarchical SNAC code levels, which are then run through the SNAC
    quantizer and decoder to produce a waveform.
    """

    def __init__(self, device: str = "cuda"):
        """
        Load the pretrained SNAC model and move it to ``device``.

        Args:
            device: Torch device string the model and code tensors are placed on.
        """
        self.device = device
        print(f"🎵 Loading SNAC 24kHz model to {device}...")
        self.snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME).eval().to(device)
        print(f"✅ SNAC decoder initialized")

    def unpack_snac_from_7(self, vocab_ids: List[int]) -> List[List[int]]:
        """
        Unpack 7-token SNAC frames to 3 hierarchical levels.

        Frame structure (7 tokens per frame):
            [slot0, slot1, slot2, slot3, slot4, slot5, slot6]

        Unpacking to [L1, L2, L3]:
            - slot0 → L1[i]       (coarse: 1x rate)
            - slot1 → L2[2*i]     (medium: 2x rate, even)
            - slot2 → L3[4*i+0]   (fine: 4x rate)
            - slot3 → L3[4*i+1]
            - slot4 → L2[2*i+1]   (medium: odd)
            - slot5 → L3[4*i+2]
            - slot6 → L3[4*i+3]

        A single trailing ``CODE_END_TOKEN_ID`` is dropped, and any tokens
        that do not fill a complete frame are ignored.

        Args:
            vocab_ids: SNAC token IDs in vocabulary space.

        Returns:
            [L1, L2, L3] where L1 has n, L2 has 2n and L3 has 4n elements
            for n complete frames; three empty lists when n == 0.
        """
        # Remove EOS token if present
        if vocab_ids and vocab_ids[-1] == CODE_END_TOKEN_ID:
            vocab_ids = vocab_ids[:-1]

        # Only complete frames are decoded; a partial trailing frame is ignored.
        frames = len(vocab_ids) // SNAC_TOKENS_PER_FRAME
        if frames == 0:
            return [[], [], []]

        l1, l2, l3 = [], [], []
        for i in range(frames):
            start = i * SNAC_TOKENS_PER_FRAME
            slots = vocab_ids[start:start + SNAC_TOKENS_PER_FRAME]

            # Subtract offset and mod 4096 to get original SNAC codes
            codes = [(token - CODE_TOKEN_OFFSET) % 4096 for token in slots]

            l1.append(codes[0])
            l2.extend((codes[1], codes[4]))  # even, odd
            l3.extend((codes[2], codes[3], codes[5], codes[6]))

        return [l1, l2, l3]

    @torch.inference_mode()
    def decode(
        self,
        snac_tokens: List[int],
        use_sliding_window: bool = False
    ) -> Optional[np.ndarray]:
        """
        Decode SNAC tokens to an audio waveform.

        Args:
            snac_tokens: List of SNAC token IDs (7*n tokens).
            use_sliding_window: If True and at least 4096 samples were
                decoded, return only the slice ``audio[2048:4096]``; the
                streaming pipeline uses this so consecutive overlapping
                windows yield one 2048-sample chunk each.

        Returns:
            1-D numpy array of decoded samples, or None when fewer than one
            complete frame of tokens is supplied.
        """
        if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
            return None

        # Unpack to 3 hierarchical levels
        levels = self.unpack_snac_from_7(snac_tokens)
        if not levels[0]:
            return None

        # One [1, len] long tensor per level, on the decoder's device
        codes = [
            torch.tensor(level, dtype=torch.long, device=self.device).unsqueeze(0)
            for level in levels
        ]

        # Decode through SNAC quantizer + decoder
        z_q = self.snac_model.quantizer.from_codes(codes)
        audio = self.snac_model.decoder(z_q)

        # Extract audio: [batch, 1, samples] → [samples]
        audio = audio[0, 0].cpu().numpy()

        # Sliding window mode: keep only samples 2048..4095 of this window.
        if use_sliding_window and len(audio) >= 4096:
            audio = audio[2048:4096]

        return audio

    @staticmethod
    def float_to_pcm16(audio) -> bytes:
        """
        Convert float samples to int16 PCM bytes (native byte order).

        Samples are clipped to [-1.0, 1.0] before scaling. Without the clip,
        a sample slightly outside that range overflows the int16 cast and
        wraps to the opposite sign, which is audible as a click.

        Args:
            audio: Array-like of float samples.

        Returns:
            Raw int16 PCM bytes, two bytes per sample.
        """
        samples = np.clip(np.asarray(audio, dtype=np.float32), -1.0, 1.0)
        return (samples * 32767).astype(np.int16).tobytes()

    def decode_to_bytes(
        self,
        snac_tokens: List[int],
        use_sliding_window: bool = False
    ) -> Optional[bytes]:
        """
        Decode SNAC tokens to audio bytes (int16 PCM).

        Args:
            snac_tokens: List of SNAC token IDs.
            use_sliding_window: Forwarded to :meth:`decode`.

        Returns:
            Audio as int16 PCM bytes, or None when nothing could be decoded.
        """
        audio = self.decode(snac_tokens, use_sliding_window=use_sliding_window)
        if audio is None:
            return None

        return self.float_to_pcm16(audio)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# CUSTOM LOGITS PROCESSOR
|
||||
# ============================================================================
|
||||
|
||||
class OnlyAudioAfterSOS:
    """
    Restricts vocabulary to SNAC codes + EOS after SOS token.

    Once ``CODE_START_TOKEN_ID`` has appeared in the prompt or in the
    generated tokens, every logit outside the SNAC code range (and other
    than ``CODE_END_TOKEN_ID``) is pushed to -inf so it cannot be sampled.

    The instance remembers that SOS was seen; call :meth:`reset` before
    reusing it for another generation.
    """

    def __init__(self):
        # Latched to True the first time SOS is found.
        self._seen_sos = False

    def __call__(
        self,
        prompt_token_ids: List[int],
        generated_token_ids: List[int],
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """
        Apply constraint: after SOS, only allow SNAC codes + EOS.

        Args:
            prompt_token_ids: Original prompt token IDs
            generated_token_ids: Tokens generated so far
            logits: Logits for next token [vocab_size]

        Returns:
            ``logits`` unchanged before SOS has been seen; otherwise a new
            tensor with disallowed tokens set to -inf.
        """
        if not self._seen_sos:
            # Test each sequence separately: no per-step copy of the whole
            # history, and it works when one is a list and the other a tuple.
            if (
                CODE_START_TOKEN_ID in generated_token_ids
                or CODE_START_TOKEN_ID in prompt_token_ids
            ):
                self._seen_sos = True
            else:
                return logits  # No constraint yet

        # Additive mask: 0 for allowed tokens, -inf for everything else
        mask = torch.full_like(logits, float('-inf'))
        mask[SNAC_MIN_ID:SNAC_MAX_ID + 1] = 0  # Allow SNAC codes
        mask[CODE_END_TOKEN_ID] = 0  # Allow EOS

        return logits + mask

    def reset(self):
        """Reset state for reuse across generations."""
        self._seen_sos = False
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# MAYA-1-VOICE MODEL
|
||||
# ============================================================================
|
||||
|
||||
class Maya1VoiceModel:
    """
    Maya-1-Voice TTS model backed by a VLLM async engine.

    Owns the tokenizer and the engine, and knows how to turn a voice
    description plus text into a chat-template prompt.
    """

    def __init__(
        self,
        model_path: str,
        dtype: str = "bfloat16",
        max_model_len: int = 8192,
        gpu_memory_utilization: float = 0.85,
    ):
        """
        Load the tokenizer and start the VLLM engine.

        Args:
            model_path: Path to model checkpoint (local or HuggingFace)
            dtype: Model precision passed to the engine
            max_model_len: Maximum sequence length passed to the engine
            gpu_memory_utilization: GPU memory fraction passed to the engine
        """
        self.model_path = model_path

        print("🚀 Initializing Maya-1-Voice Model")
        print(f"📁 Model: {model_path}")
        print(f"🔢 Dtype: {dtype}")

        # The tokenizer is loaded from the same checkpoint as the model.
        # NOTE(review): trust_remote_code=True executes code shipped with the
        # checkpoint; only point this at checkpoints you trust.
        print("📝 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.tokenizer = tokenizer
        print(f"✅ Tokenizer loaded: {len(tokenizer)} tokens")

        # Build the async engine from a single argument bundle
        print("🔧 Initializing VLLM engine...")
        engine_options = {
            "model": model_path,
            "tokenizer": model_path,
            "dtype": dtype,
            "max_model_len": max_model_len,
            "gpu_memory_utilization": gpu_memory_utilization,
            "trust_remote_code": True,
        }
        self.engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_options))
        print("✅ VLLM engine ready")

    def build_prompt(self, description: str, text: str) -> str:
        """
        Build the generation prompt via the tokenizer's chat template.

        The single user message has the form ``<description="..."> text``.

        Args:
            description: Voice description, e.g. "Realistic male voice in the
                30s age with american accent. Normal pitch, warm timbre,
                conversational pacing."
            text: Text to synthesize, optionally containing <emotion> tags,
                e.g. "Hello world! <excited> This is amazing!"

        Returns:
            The untokenized prompt string, ending with the generation prompt.
        """
        user_turn = {
            "role": "user",
            "content": f'<description="{description}"> {text}',
        }
        return self.tokenizer.apply_chat_template(
            [user_turn],
            tokenize=False,
            add_generation_prompt=True,
        )
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# STREAMING PIPELINE
|
||||
# ============================================================================
|
||||
|
||||
class Maya1VoiceStreamingPipeline:
    """
    Streaming TTS pipeline using a sliding window approach.

    This generates audio by:
    1. Streaming tokens from VLLM as they're generated
    2. After every complete SNAC frame, decoding the most recent
       ``WINDOW_FRAMES`` frames (sliding window)
    3. Keeping only the slice the decoder returns in sliding-window mode
    4. Yielding that slice as one int16 PCM chunk
    """

    # Frames per decode window. The decoder's sliding-window slice
    # (audio[2048:4096]) is written for this value; do not change one
    # without the other.
    WINDOW_FRAMES = 4

    def __init__(self, model: Maya1VoiceModel, snac_decoder: SNACDecoder):
        """Store the model and decoder used by :meth:`generate_speech_stream`."""
        self.model = model
        self.snac_decoder = snac_decoder
        print(f"🌊 Maya-1-Voice Streaming Pipeline initialized")

    async def generate_speech_stream(
        self,
        description: str,
        text: str,
        temperature: float = DEFAULT_TEMPERATURE,
        top_p: float = DEFAULT_TOP_P,
        max_tokens: int = DEFAULT_MAX_TOKENS,
        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
    ) -> AsyncGenerator[bytes, None]:
        """
        Generate speech audio with streaming.

        Args:
            description: Voice/character description
            text: Text to synthesize (with optional <emotion> tags)
            temperature: Sampling temperature
            top_p: Nucleus sampling threshold
            max_tokens: Maximum number of tokens to generate
            repetition_penalty: Repetition penalty passed to the sampler

        Yields:
            Audio chunks as bytes (int16 PCM), one per decoded window.
        """
        import time
        import uuid

        print(f"\n🌊 Starting streaming generation")
        print(f"📝 Description: {description[:80]}...")
        print(f"💬 Text: {text}")

        # Build prompt
        prompt = self.model.build_prompt(description, text)

        # Configure sampling (no custom logits processor is attached here)
        sampling_params = SamplingParams(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            min_tokens=DEFAULT_MIN_TOKENS,
            repetition_penalty=repetition_penalty,
            stop_token_ids=[CODE_END_TOKEN_ID],  # Stop on audio EOS
        )

        print(f"🎲 Sampling: temp={temperature}, top_p={top_p}, max_tokens={max_tokens}")

        # Only the most recent window is ever decoded, so keep just that many
        # tokens plus a running count instead of the full history.
        window_size = self.WINDOW_FRAMES * SNAC_TOKENS_PER_FRAME
        window: List[int] = []
        snac_count = 0
        total_tokens = 0
        total_chunks = 0

        request_id = f"maya1voice-{uuid.uuid4().hex[:8]}-{int(time.time() * 1000000)}"

        results_generator = self.model.engine.generate(
            prompt=prompt,
            sampling_params=sampling_params,
            request_id=request_id,
        )

        # Stream tokens with sliding window decoding
        async for request_output in results_generator:
            generated_ids = request_output.outputs[0].token_ids

            # Each output carries all tokens so far; process only the new tail
            new_tokens = generated_ids[total_tokens:]
            total_tokens = len(generated_ids)

            for token_id in new_tokens:
                # Non-SNAC tokens are ignored and must not trigger a decode,
                # otherwise the previous window would be emitted twice.
                if not (SNAC_MIN_ID <= token_id <= SNAC_MAX_ID):
                    continue

                window.append(token_id)
                snac_count += 1
                if len(window) > window_size:
                    del window[0]

                # Decode once per completed frame, once a full window exists.
                if snac_count % SNAC_TOKENS_PER_FRAME != 0 or snac_count < window_size:
                    continue

                audio_bytes = self.snac_decoder.decode_to_bytes(
                    list(window),
                    use_sliding_window=True
                )

                if audio_bytes:
                    total_chunks += 1
                    if total_chunks == 1:
                        print(f"🎵 First chunk decoded ({len(audio_bytes)} bytes)")
                    yield audio_bytes

        # NOTE(review): each window contributes only the decoder's
        # sliding-window slice, so samples after that slice in the final
        # window are never emitted — confirm this tail loss is intended.
        print(f"✅ Streaming complete: {total_tokens} tokens → {total_chunks} chunks")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# MAIN EXAMPLE
|
||||
# ============================================================================
|
||||
|
||||
async def main(model_path: Optional[str] = None):
    """
    Example usage of Maya-1-Voice streaming inference.

    This demonstrates:
    1. Model initialization
    2. SNAC decoder setup
    3. Streaming generation
    4. Audio chunk handling and saving each result as a WAV file

    Args:
        model_path: Checkpoint to load. When omitted, the ``MAYA1_MODEL_PATH``
            environment variable is used, falling back to the original
            hard-coded local path.
    """
    import os
    import wave

    if model_path is None:
        model_path = os.environ.get(
            "MAYA1_MODEL_PATH",
            "/home/ubuntu/veena_temp/maya-1-voice",  # Original local default
        )
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print("=" * 80)
    print("Maya-1-Voice VLLM Streaming Inference Example")
    print("=" * 80)

    # Initialize model
    model = Maya1VoiceModel(
        model_path=model_path,
        dtype="bfloat16",
        max_model_len=8192,
        gpu_memory_utilization=0.8,
    )

    # Initialize SNAC decoder
    snac_decoder = SNACDecoder(device=device)

    # Create pipeline
    pipeline = Maya1VoiceStreamingPipeline(model, snac_decoder)

    async def run_example(title, description, text, temperature, max_tokens, output_file):
        """Stream one utterance, report its size, and save it as a mono 16-bit WAV."""
        print(f"\n{'='*80}")
        print(title)
        print(f"{'='*80}")

        audio_chunks = []
        async for chunk in pipeline.generate_speech_stream(
            description=description,
            text=text,
            temperature=temperature,
            max_tokens=max_tokens,
        ):
            audio_chunks.append(chunk)
            print(f"📦 Received chunk {len(audio_chunks)}: {len(chunk)} bytes")

        # Combine chunks (2 bytes per int16 sample)
        full_audio = b''.join(audio_chunks)
        print(f"\n✅ Total audio: {len(full_audio)} bytes ({len(full_audio)//2} samples, {len(full_audio)/2/SNAC_SAMPLE_RATE:.2f}s)")

        # `wave` is in the standard library, so no ImportError guard is needed.
        with wave.open(output_file, 'wb') as wav:
            wav.setnchannels(1)  # Mono
            wav.setsampwidth(2)  # 16-bit
            wav.setframerate(SNAC_SAMPLE_RATE)
            wav.writeframes(full_audio)
        print(f"💾 Saved to {output_file}")

    # Example 1: Professional voice
    await run_example(
        title="Example 1: Professional Voice",
        description=(
            "Realistic male voice in the 30s age with american accent. "
            "Normal pitch, warm timbre, conversational pacing, neutral tone delivery at med intensity."
        ),
        text="Hello! This is a test of the Maya-1-Voice text-to-speech system.",
        temperature=0.4,
        max_tokens=500,
        output_file="output_example1.wav",
    )

    # Example 2: Character voice with emotions
    await run_example(
        title="Example 2: Character Voice with Emotions",
        description=(
            "Creative, dark_villain character. Male voice in their 40s with british accent. "
            "Low pitch, gravelly timbre, slow pacing, angry tone at high intensity."
        ),
        text="The darkness isn't coming... <angry> it's already here!",
        temperature=0.5,
        max_tokens=800,
        output_file="output_example2.wav",
    )

    print(f"\n{'='*80}")
    print("🎉 Examples complete!")
    print(f"{'='*80}")


if __name__ == "__main__":
    # Run async main
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user