Text-to-Speech Pipeline — Intermediate
The TTS (Text-to-Speech) stage converts your system's text responses into natural-sounding audio. Modern neural TTS has made robotic voices a thing of the past — but choosing the right engine, optimizing for streaming latency, and controlling voice characteristics requires careful engineering. This lesson covers production TTS providers, voice cloning, SSML markup, and streaming audio delivery.
TTS Engine Comparison
| Provider | Quality | Streaming | First-Byte Latency | Cost per 1M chars | Voice Cloning | Best For |
|---|---|---|---|---|---|---|
| ElevenLabs | Excellent | Yes | ~150ms | $30-300 | Yes (instant + pro) | Best quality, voice cloning, emotional range |
| Azure Neural TTS | Very Good | Yes | ~200ms | $16 | Yes (Custom Neural Voice) | Enterprise, SSML control, 400+ voices |
| Google Cloud TTS | Very Good | Yes | ~250ms | $16 | No | Multi-language, stable API, Journey voices |
| Amazon Polly | Good | Yes | ~300ms | $4-16 | No | Cheapest at scale, NTTS voices |
| OpenAI TTS | Very Good | Yes | ~300ms | $15 | No | Simple API, good default voices |
| XTTS (Self-hosted) | Good-Excellent | Partial | ~500ms (varies with GPU hardware) | GPU cost only | Yes (zero-shot) | Data privacy, unlimited usage, custom control |
Streaming TTS with ElevenLabs (Production Code)
import aiohttp
import asyncio
from typing import AsyncIterator
from dataclasses import dataclass
@dataclass
class TTSConfig:
    """Tuning parameters for an ElevenLabs speech-synthesis request.

    Defaults favor low-latency real-time use: the turbo model and raw
    16 kHz PCM output (no container to decode before playback).
    """

    voice_id: str = "21m00Tcm4TlvDq8ikWAM"  # "Rachel", a stock ElevenLabs voice
    model_id: str = "eleven_turbo_v2"  # Fastest model
    stability: float = 0.5  # 0=variable/expressive, 1=stable/monotone
    similarity_boost: float = 0.75  # Voice similarity to the original speaker
    style: float = 0.0  # Style exaggeration (0 disables; higher adds latency)
    use_speaker_boost: bool = True  # Enhance clarity
    output_format: str = "pcm_16000"  # Raw 16 kHz PCM for real-time playback
class ElevenLabsStreamingTTS:
    """Production streaming TTS client for ElevenLabs.

    Audio is yielded chunk-by-chunk as synthesis progresses, so playback
    of the first words can begin while the rest of the sentence is still
    being generated.
    """

    BASE_URL = "https://api.elevenlabs.io/v1"

    def __init__(self, api_key: str, config: TTSConfig = None):
        self.api_key = api_key
        self.config = config or TTSConfig()

    async def synthesize_streaming(self, text: str) -> AsyncIterator[bytes]:
        """Yield raw audio chunks as they are generated.

        The first chunk typically lands in ~150ms, with subsequent chunks
        every ~50-100ms; a full sentence totals roughly 1-3 seconds.

        Raises:
            RuntimeError: if the API responds with a non-200 status.
        """
        cfg = self.config
        endpoint = (
            f"{self.BASE_URL}/text-to-speech/{cfg.voice_id}"
            f"/stream?output_format={cfg.output_format}"
        )
        voice_settings = {
            "stability": cfg.stability,
            "similarity_boost": cfg.similarity_boost,
            "style": cfg.style,
            "use_speaker_boost": cfg.use_speaker_boost,
        }
        body = {
            "text": text,
            "model_id": cfg.model_id,
            "voice_settings": voice_settings,
        }
        request_headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json",
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(
                endpoint, json=body, headers=request_headers
            ) as resp:
                if resp.status != 200:
                    error = await resp.text()
                    raise RuntimeError(f"TTS failed ({resp.status}): {error}")
                # Forward each chunk to the caller the moment it arrives.
                async for chunk in resp.content.iter_chunked(4096):
                    if chunk:
                        yield chunk

    async def synthesize_full(self, text: str) -> bytes:
        """Get complete audio (non-streaming). Use for short prompts."""
        return b"".join(
            [chunk async for chunk in self.synthesize_streaming(text)]
        )
# --- Usage in voice pipeline ---
async def speak_response(text: str):
    """Synthesize and play response with streaming for low latency."""
    synthesizer = ElevenLabsStreamingTTS(api_key="YOUR_KEY")
    player = AudioPlayer(sample_rate=16000)
    # Queue chunks for playback as soon as they land, rather than
    # waiting for synthesis of the full utterance to finish.
    async for chunk in synthesizer.synthesize_streaming(text):
        player.queue_chunk(chunk)
    await player.wait_until_done()
SSML for Fine-Grained Voice Control
SSML (Speech Synthesis Markup Language) gives you precise control over how text is spoken. Supported by Azure, Google, and Amazon Polly.
# SSML examples for production voice applications
class SSMLBuilder:
    """Build SSML markup for voice applications.

    SSML gives you control over pauses, emphasis, pronunciation,
    speaking rate, and more - things plain text cannot express.

    All user-supplied text is XML-escaped before being embedded, so
    characters such as '&', '<', or '>' cannot produce invalid markup
    or inject unintended SSML tags. Attribute values (emphasis level,
    say-as format, prosody rate/pitch) are developer-chosen vocabulary
    tokens and are passed through unchanged.
    """

    def __init__(self):
        # Accumulated markup fragments, joined with spaces in build().
        self.parts = []

    @staticmethod
    def _esc(text: str) -> str:
        """XML-escape user text so it is safe inside SSML element content."""
        from xml.sax.saxutils import escape
        return escape(text)

    def add_text(self, text: str) -> "SSMLBuilder":
        """Append plain spoken text."""
        self.parts.append(self._esc(text))
        return self

    def add_pause(self, duration_ms: int) -> "SSMLBuilder":
        """Insert a pause. Use after confirmations, before important info."""
        self.parts.append(f'<break time="{duration_ms}ms"/>')
        return self

    def add_emphasis(self, text: str,
                     level: str = "moderate") -> "SSMLBuilder":
        """Emphasize a word. Levels: reduced, moderate, strong."""
        self.parts.append(
            f'<emphasis level="{level}">{self._esc(text)}</emphasis>'
        )
        return self

    def add_spell_out(self, text: str) -> "SSMLBuilder":
        """Spell out text character by character. Use for IDs, codes."""
        self.parts.append(
            f'<say-as interpret-as="characters">{self._esc(text)}</say-as>'
        )
        return self

    def add_number(self, number: str,
                   format: str = "cardinal") -> "SSMLBuilder":
        """Speak number in specific format: cardinal, ordinal, telephone."""
        self.parts.append(
            f'<say-as interpret-as="{format}">{self._esc(number)}</say-as>'
        )
        return self

    def add_date(self, date: str, format: str = "mdy") -> "SSMLBuilder":
        """Speak a date. Formats: mdy, dmy, ymd, md, dm, ym, my, d, m, y."""
        self.parts.append(
            f'<say-as interpret-as="date" format="{format}">'
            f'{self._esc(date)}</say-as>'
        )
        return self

    def set_rate(self, text: str, rate: str = "medium") -> "SSMLBuilder":
        """Control speaking rate. Options: x-slow, slow, medium, fast, x-fast."""
        self.parts.append(
            f'<prosody rate="{rate}">{self._esc(text)}</prosody>'
        )
        return self

    def set_pitch(self, text: str, pitch: str = "medium") -> "SSMLBuilder":
        """Control pitch. Options: x-low, low, medium, high, x-high."""
        self.parts.append(
            f'<prosody pitch="{pitch}">{self._esc(text)}</prosody>'
        )
        return self

    def build(self) -> str:
        """Return the complete document wrapped in a <speak> root element."""
        content = " ".join(self.parts)
        return f'<speak>{content}</speak>'
# --- Production SSML examples ---
# Example 1: Order confirmation (IVR)
_order = SSMLBuilder()
_order.add_text("Your order")
_order.add_spell_out("A B C")
_order.add_number("1234", "characters")
_order.add_pause(300)
_order.add_text("has been confirmed.")
_order.add_pause(500)
_order.add_text("The total is")
_order.add_emphasis("$47.99", "moderate")
_order.add_pause(200)
_order.add_text("and your estimated delivery date is")
_order.add_date("03/25/2026", "mdy")
order_ssml = _order.build()
# Example 2: Account balance (phone banking)
_balance = SSMLBuilder()
_balance.add_text("Your current account balance is")
_balance.add_pause(300)
_balance.add_emphasis("$12,450.32", "strong")
_balance.add_pause(500)
_balance.set_rate("Your last transaction was a debit of $89.99 at Amazon.", "slow")
balance_ssml = _balance.build()
Voice Cloning Architecture
# Voice cloning workflow for production applications
class VoiceCloningPipeline:
    """Production voice cloning pipeline.

    Two approaches:
      1. Instant clone: 30 seconds of audio, decent quality
      2. Professional clone: 30+ minutes of audio, high quality

    Ethical requirements:
      - Written consent from voice owner
      - Watermarking of cloned audio
      - Usage restrictions in ToS
    """

    def __init__(self, provider: str = "elevenlabs", api_key: str = ""):
        # BUG FIX: instant_clone() reads self.api_key, but the original
        # __init__ never set it, guaranteeing an AttributeError on every
        # call. Accept it here; the keyword default keeps existing
        # single-argument call sites working.
        self.provider = provider
        self.api_key = api_key

    async def instant_clone(self, audio_samples: list[bytes],
                            name: str, description: str) -> str:
        """Create an instant voice clone from short audio samples.

        Args:
            audio_samples: List of audio files (30s-5min total)
            name: Name for the cloned voice
            description: Description of voice characteristics

        Returns:
            voice_id: ID to use in TTS calls

        Raises:
            RuntimeError: if the API rejects the request (non-200 status).
        """
        # ElevenLabs instant clone API: multipart form with one "files"
        # field per audio sample.
        form_data = aiohttp.FormData()
        form_data.add_field("name", name)
        form_data.add_field("description", description)
        for i, sample in enumerate(audio_samples):
            form_data.add_field(
                "files", sample,
                filename=f"sample_{i}.wav",
                content_type="audio/wav"
            )
        async with aiohttp.ClientSession() as session:
            async with session.post(
                "https://api.elevenlabs.io/v1/voices/add",
                data=form_data,
                headers={"xi-api-key": self.api_key}
            ) as resp:
                # Surface API errors explicitly instead of failing later
                # with an opaque KeyError on "voice_id".
                if resp.status != 200:
                    error = await resp.text()
                    raise RuntimeError(
                        f"Voice clone failed ({resp.status}): {error}"
                    )
                result = await resp.json()
                return result["voice_id"]

    def get_recording_guidelines(self) -> dict:
        """Guidelines for collecting voice samples for cloning."""
        return {
            "minimum_duration": "30 seconds for instant, 30 minutes for professional",
            "sample_rate": "44100 Hz (44.1 kHz)",
            "format": "WAV, 16-bit, mono",
            "environment": "Quiet room, no echo, no background noise",
            "microphone": "Condenser mic or high-quality headset",
            "content": [
                "Read diverse text (news, stories, technical content)",
                "Include questions, exclamations, and statements",
                "Vary emotion: neutral, happy, concerned, professional",
                "Include domain-specific terms the voice will commonly say"
            ],
            "consent": "REQUIRED: Written consent from the voice owner",
            "legal": "Add voice watermarking, maintain audit trail"
        }
TTS Latency Optimization
# Strategies to minimize TTS latency in production
class TTSLatencyOptimizer:
    """Production techniques for minimizing TTS latency.

    Combines sentence-level chunking (start speaking sentence 1 while
    sentence 2 synthesizes) with an audio cache keyed by text hash.
    """

    def __init__(self, tts_client):
        self.tts = tts_client
        self.cache = {}  # In production, use Redis
        # BUG FIX: asyncio holds only weak references to tasks, so the
        # fire-and-forget warm-up tasks created in
        # precache_common_responses() could be garbage-collected before
        # completing. Keep strong references here until each task ends.
        self._pending_tasks = set()

    async def chunk_and_stream(self, text: str) -> AsyncIterator[bytes]:
        """Split long responses into sentences, stream each independently.

        Instead of waiting for full response TTS, start speaking the first
        sentence while generating audio for subsequent sentences.
        """
        sentences = self._split_sentences(text)
        for sentence in sentences:
            # Serve from cache when this sentence was synthesized before.
            cache_key = self._cache_key(sentence)
            if cache_key in self.cache:
                yield self.cache[cache_key]
                continue
            # Generate, cache, then stream.
            audio = await self.tts.synthesize_full(sentence)
            self.cache[cache_key] = audio
            yield audio

    def precache_common_responses(self, responses: list[str]):
        """Pre-generate audio for common responses (greetings, errors, etc.).

        Cache these at startup to eliminate TTS latency for frequent
        utterances. Must be called from within a running event loop
        (asyncio.create_task requires one).
        """
        common = [
            "Hello, how can I help you today?",
            "Could you please repeat that?",
            "I'm sorry, I didn't understand.",
            "Let me look that up for you.",
            "Is there anything else I can help with?",
            "Thank you for calling. Goodbye.",
            "Please hold while I transfer your call.",
            "I'm connecting you with a live agent now.",
        ]
        for text in common + responses:
            # Pre-generate in background; hold a strong reference so the
            # task survives until it finishes, then drop it.
            task = asyncio.create_task(self._warm_cache(text))
            self._pending_tasks.add(task)
            task.add_done_callback(self._pending_tasks.discard)

    async def _warm_cache(self, text: str):
        """Synthesize *text* and store it in the cache if not present."""
        cache_key = self._cache_key(text)
        if cache_key not in self.cache:
            audio = await self.tts.synthesize_full(text)
            self.cache[cache_key] = audio

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences for incremental TTS."""
        import re
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _cache_key(self, text: str) -> str:
        """Hash the text to a stable cache key (non-cryptographic use)."""
        import hashlib
        return hashlib.md5(text.encode()).hexdigest()
Production Tip: Pre-cache your top 20 most common responses (greetings, hold messages, error messages, goodbyes). This eliminates TTS latency entirely for ~30-40% of utterances in a typical IVR system. Store the cached audio in Redis with the text hash as the key.
Lilly Tech Systems