Text-to-Speech Pipeline — Intermediate
The TTS (Text-to-Speech) stage converts your system's text responses into natural-sounding audio. Modern neural TTS has made robotic voices a thing of the past — but choosing the right engine, optimizing for streaming latency, and controlling voice characteristics requires careful engineering. This lesson covers production TTS providers, voice cloning, SSML markup, and streaming audio delivery.
TTS Engine Comparison
| Provider | Quality | Streaming | First-Byte Latency | Cost per 1M chars | Voice Cloning | Best For |
|---|---|---|---|---|---|---|
| ElevenLabs | Excellent | Yes | ~150ms | $30-300 | Yes (instant + pro) | Best quality, voice cloning, emotional range |
| Azure Neural TTS | Very Good | Yes | ~200ms | $16 | Yes (Custom Neural Voice) | Enterprise, SSML control, 400+ voices |
| Google Cloud TTS | Very Good | Yes | ~250ms | $16 | No | Multi-language, stable API, Journey voices |
| Amazon Polly | Good | Yes | ~300ms | $4-16 | No | Cheapest at scale, NTTS voices |
| OpenAI TTS | Very Good | Yes | ~300ms | $15 | No | Simple API, good default voices |
| XTTS (Self-hosted) | Good-Excellent | Partial | ~500ms (varies with GPU hardware) | GPU cost only | Yes (zero-shot) | Data privacy, unlimited usage, custom control |
Streaming TTS with ElevenLabs (Production Code)
import aiohttp
import asyncio
from typing import AsyncIterator
from dataclasses import dataclass
@dataclass
class TTSConfig:
    """Tuning parameters for an ElevenLabs speech-synthesis request.

    Defaults favor low-latency real-time use: the turbo model and raw
    16 kHz PCM output (no container to decode before playback).
    """

    voice_id: str = "21m00Tcm4TlvDq8ikWAM"  # "Rachel", a stock ElevenLabs voice
    model_id: str = "eleven_turbo_v2"  # Fastest model
    stability: float = 0.5  # 0=variable/expressive, 1=stable/monotone
    similarity_boost: float = 0.75  # Voice similarity to the original speaker
    style: float = 0.0  # Style exaggeration (0 disables; higher adds latency)
    use_speaker_boost: bool = True  # Enhance clarity
    output_format: str = "pcm_16000"  # Raw 16 kHz PCM for real-time playback
class ElevenLabsStreamingTTS:
    """Production streaming TTS client for ElevenLabs.

    Audio is yielded chunk-by-chunk as synthesis progresses, so playback
    of the first words can begin while the rest of the sentence is still
    being generated.
    """

    BASE_URL = "https://api.elevenlabs.io/v1"

    def __init__(self, api_key: str, config: TTSConfig = None):
        self.api_key = api_key
        self.config = config or TTSConfig()

    async def synthesize_streaming(self, text: str) -> AsyncIterator[bytes]:
        """Yield raw audio chunks as they are generated.

        The first chunk typically lands in ~150ms, with subsequent chunks
        every ~50-100ms; a full sentence totals roughly 1-3 seconds.

        Raises:
            RuntimeError: if the API responds with a non-200 status.
        """
        cfg = self.config
        endpoint = (
            f"{self.BASE_URL}/text-to-speech/{cfg.voice_id}"
            f"/stream?output_format={cfg.output_format}"
        )
        voice_settings = {
            "stability": cfg.stability,
            "similarity_boost": cfg.similarity_boost,
            "style": cfg.style,
            "use_speaker_boost": cfg.use_speaker_boost,
        }
        body = {
            "text": text,
            "model_id": cfg.model_id,
            "voice_settings": voice_settings,
        }
        request_headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json",
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(
                endpoint, json=body, headers=request_headers
            ) as resp:
                if resp.status != 200:
                    error = await resp.text()
                    raise RuntimeError(f"TTS failed ({resp.status}): {error}")
                # Forward each chunk to the caller the moment it arrives.
                async for chunk in resp.content.iter_chunked(4096):
                    if chunk:
                        yield chunk

    async def synthesize_full(self, text: str) -> bytes:
        """Get complete audio (non-streaming). Use for short prompts."""
        return b"".join(
            [chunk async for chunk in self.synthesize_streaming(text)]
        )
# --- Usage in voice pipeline ---
async def speak_response(text: str):
    """Synthesize and play response with streaming for low latency."""
    synthesizer = ElevenLabsStreamingTTS(api_key="YOUR_KEY")
    player = AudioPlayer(sample_rate=16000)
    # Queue chunks for playback as soon as they land, rather than
    # waiting for synthesis of the full utterance to finish.
    async for chunk in synthesizer.synthesize_streaming(text):
        player.queue_chunk(chunk)
    await player.wait_until_done()
SSML for Fine-Grained Voice Control
SSML (Speech Synthesis Markup Language) gives you precise control over how text is spoken. Supported by Azure, Google, and Amazon Polly.
# SSML examples for production voice applications
class SSMLBuilder:
    """Build SSML markup for voice applications.

    SSML gives you control over pauses, emphasis, pronunciation,
    speaking rate, and more - things plain text cannot express.

    All user-supplied text is XML-escaped before being embedded, so
    characters such as '&', '<', or '>' cannot produce invalid markup
    or inject unintended SSML tags. Attribute values (emphasis level,
    say-as format, prosody rate/pitch) are developer-chosen vocabulary
    tokens and are passed through unchanged.
    """

    def __init__(self):
        # Accumulated markup fragments, joined with spaces in build().
        self.parts = []

    @staticmethod
    def _esc(text: str) -> str:
        """XML-escape user text so it is safe inside SSML element content."""
        from xml.sax.saxutils import escape
        return escape(text)

    def add_text(self, text: str) -> "SSMLBuilder":
        """Append plain spoken text."""
        self.parts.append(self._esc(text))
        return self

    def add_pause(self, duration_ms: int) -> "SSMLBuilder":
        """Insert a pause. Use after confirmations, before important info."""
        self.parts.append(f'<break time="{duration_ms}ms"/>')
        return self

    def add_emphasis(self, text: str,
                     level: str = "moderate") -> "SSMLBuilder":
        """Emphasize a word. Levels: reduced, moderate, strong."""
        self.parts.append(
            f'<emphasis level="{level}">{self._esc(text)}</emphasis>'
        )
        return self

    def add_spell_out(self, text: str) -> "SSMLBuilder":
        """Spell out text character by character. Use for IDs, codes."""
        self.parts.append(
            f'<say-as interpret-as="characters">{self._esc(text)}</say-as>'
        )
        return self

    def add_number(self, number: str,
                   format: str = "cardinal") -> "SSMLBuilder":
        """Speak number in specific format: cardinal, ordinal, telephone."""
        self.parts.append(
            f'<say-as interpret-as="{format}">{self._esc(number)}</say-as>'
        )
        return self

    def add_date(self, date: str, format: str = "mdy") -> "SSMLBuilder":
        """Speak a date. Formats: mdy, dmy, ymd, md, dm, ym, my, d, m, y."""
        self.parts.append(
            f'<say-as interpret-as="date" format="{format}">'
            f'{self._esc(date)}</say-as>'
        )
        return self

    def set_rate(self, text: str, rate: str = "medium") -> "SSMLBuilder":
        """Control speaking rate. Options: x-slow, slow, medium, fast, x-fast."""
        self.parts.append(
            f'<prosody rate="{rate}">{self._esc(text)}</prosody>'
        )
        return self

    def set_pitch(self, text: str, pitch: str = "medium") -> "SSMLBuilder":
        """Control pitch. Options: x-low, low, medium, high, x-high."""
        self.parts.append(
            f'<prosody pitch="{pitch}">{self._esc(text)}</prosody>'
        )
        return self

    def build(self) -> str:
        """Return the complete document wrapped in a <speak> root element."""
        content = " ".join(self.parts)
        return f'<speak>{content}</speak>'
# --- Production SSML examples ---
# Example 1: Order confirmation (IVR)
_order = SSMLBuilder()
_order.add_text("Your order")
_order.add_spell_out("A B C")
_order.add_number("1234", "characters")
_order.add_pause(300)
_order.add_text("has been confirmed.")
_order.add_pause(500)
_order.add_text("The total is")
_order.add_emphasis("$47.99", "moderate")
_order.add_pause(200)
_order.add_text("and your estimated delivery date is")
_order.add_date("03/25/2026", "mdy")
order_ssml = _order.build()
# Example 2: Account balance (phone banking)
_balance = SSMLBuilder()
_balance.add_text("Your current account balance is")
_balance.add_pause(300)
_balance.add_emphasis("$12,450.32", "strong")
_balance.add_pause(500)
_balance.set_rate("Your last transaction was a debit of $89.99 at Amazon.", "slow")
balance_ssml = _balance.build()
Voice Cloning Architecture
# Voice cloning workflow for production applications
class VoiceCloningPipeline:
    """Production voice cloning pipeline.

    Two approaches:
      1. Instant clone: 30 seconds of audio, decent quality
      2. Professional clone: 30+ minutes of audio, high quality

    Ethical requirements:
      - Written consent from voice owner
      - Watermarking of cloned audio
      - Usage restrictions in ToS
    """

    def __init__(self, provider: str = "elevenlabs", api_key: str = ""):
        # BUG FIX: instant_clone() reads self.api_key, but the original
        # __init__ never set it, guaranteeing an AttributeError on every
        # call. Accept it here; the keyword default keeps existing
        # single-argument call sites working.
        self.provider = provider
        self.api_key = api_key

    async def instant_clone(self, audio_samples: list[bytes],
                            name: str, description: str) -> str:
        """Create an instant voice clone from short audio samples.

        Args:
            audio_samples: List of audio files (30s-5min total)
            name: Name for the cloned voice
            description: Description of voice characteristics

        Returns:
            voice_id: ID to use in TTS calls

        Raises:
            RuntimeError: if the API rejects the request (non-200 status).
        """
        # ElevenLabs instant clone API: multipart form with one "files"
        # field per audio sample.
        form_data = aiohttp.FormData()
        form_data.add_field("name", name)
        form_data.add_field("description", description)
        for i, sample in enumerate(audio_samples):
            form_data.add_field(
                "files", sample,
                filename=f"sample_{i}.wav",
                content_type="audio/wav"
            )
        async with aiohttp.ClientSession() as session:
            async with session.post(
                "https://api.elevenlabs.io/v1/voices/add",
                data=form_data,
                headers={"xi-api-key": self.api_key}
            ) as resp:
                # Surface API errors explicitly instead of failing later
                # with an opaque KeyError on "voice_id".
                if resp.status != 200:
                    error = await resp.text()
                    raise RuntimeError(
                        f"Voice clone failed ({resp.status}): {error}"
                    )
                result = await resp.json()
                return result["voice_id"]

    def get_recording_guidelines(self) -> dict:
        """Guidelines for collecting voice samples for cloning."""
        return {
            "minimum_duration": "30 seconds for instant, 30 minutes for professional",
            "sample_rate": "44100 Hz (44.1 kHz)",
            "format": "WAV, 16-bit, mono",
            "environment": "Quiet room, no echo, no background noise",
            "microphone": "Condenser mic or high-quality headset",
            "content": [
                "Read diverse text (news, stories, technical content)",
                "Include questions, exclamations, and statements",
                "Vary emotion: neutral, happy, concerned, professional",
                "Include domain-specific terms the voice will commonly say"
            ],
            "consent": "REQUIRED: Written consent from the voice owner",
            "legal": "Add voice watermarking, maintain audit trail"
        }
TTS Latency Optimization
# Strategies to minimize TTS latency in production
class TTSLatencyOptimizer:
    """Production techniques for minimizing TTS latency.

    Combines sentence-level chunking (start speaking sentence 1 while
    sentence 2 synthesizes) with an audio cache keyed by text hash.
    """

    def __init__(self, tts_client):
        self.tts = tts_client
        self.cache = {}  # In production, use Redis
        # BUG FIX: asyncio holds only weak references to tasks, so the
        # fire-and-forget warm-up tasks created in
        # precache_common_responses() could be garbage-collected before
        # completing. Keep strong references here until each task ends.
        self._pending_tasks = set()

    async def chunk_and_stream(self, text: str) -> AsyncIterator[bytes]:
        """Split long responses into sentences, stream each independently.

        Instead of waiting for full response TTS, start speaking the first
        sentence while generating audio for subsequent sentences.
        """
        sentences = self._split_sentences(text)
        for sentence in sentences:
            # Serve from cache when this sentence was synthesized before.
            cache_key = self._cache_key(sentence)
            if cache_key in self.cache:
                yield self.cache[cache_key]
                continue
            # Generate, cache, then stream.
            audio = await self.tts.synthesize_full(sentence)
            self.cache[cache_key] = audio
            yield audio

    def precache_common_responses(self, responses: list[str]):
        """Pre-generate audio for common responses (greetings, errors, etc.).

        Cache these at startup to eliminate TTS latency for frequent
        utterances. Must be called from within a running event loop
        (asyncio.create_task requires one).
        """
        common = [
            "Hello, how can I help you today?",
            "Could you please repeat that?",
            "I'm sorry, I didn't understand.",
            "Let me look that up for you.",
            "Is there anything else I can help with?",
            "Thank you for calling. Goodbye.",
            "Please hold while I transfer your call.",
            "I'm connecting you with a live agent now.",
        ]
        for text in common + responses:
            # Pre-generate in background; hold a strong reference so the
            # task survives until it finishes, then drop it.
            task = asyncio.create_task(self._warm_cache(text))
            self._pending_tasks.add(task)
            task.add_done_callback(self._pending_tasks.discard)

    async def _warm_cache(self, text: str):
        """Synthesize *text* and store it in the cache if not present."""
        cache_key = self._cache_key(text)
        if cache_key not in self.cache:
            audio = await self.tts.synthesize_full(text)
            self.cache[cache_key] = audio

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences for incremental TTS."""
        import re
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _cache_key(self, text: str) -> str:
        """Hash the text to a stable cache key (non-cryptographic use)."""
        import hashlib
        return hashlib.md5(text.encode()).hexdigest()
Production Tip: Pre-cache your top 20 most common responses (greetings, hold messages, error messages, goodbyes). This eliminates TTS latency entirely for ~30-40% of utterances in a typical IVR system. Store the cached audio in Redis with the text hash as the key.
Lilly Tech Systems