diff --git a/examples/540-livekit-voice-agent-python/.env.example b/examples/540-livekit-voice-agent-python/.env.example new file mode 100644 index 0000000..c9198b1 --- /dev/null +++ b/examples/540-livekit-voice-agent-python/.env.example @@ -0,0 +1,10 @@ +# Deepgram API key for speech-to-text and text-to-speech +DEEPGRAM_API_KEY= + +# LiveKit server connection +LIVEKIT_URL=wss://your-livekit-server.livekit.cloud +LIVEKIT_API_KEY= +LIVEKIT_API_SECRET= + +# OpenAI API key for the LLM +OPENAI_API_KEY= diff --git a/examples/540-livekit-voice-agent-python/BLOG.md b/examples/540-livekit-voice-agent-python/BLOG.md new file mode 100644 index 0000000..8e42536 --- /dev/null +++ b/examples/540-livekit-voice-agent-python/BLOG.md @@ -0,0 +1,418 @@ +# Building a Real-Time Voice Agent with LiveKit and Deepgram + +In this tutorial, we'll build a voice-based AI assistant that can have natural conversations with users in real-time. We'll use LiveKit's Agents framework for the real-time communication infrastructure, Deepgram for speech recognition and synthesis, and OpenAI for the conversational intelligence. + +## What We're Building + +Our voice agent will: +- Join a LiveKit room and listen for user speech +- Transcribe speech in real-time using Deepgram's Nova-3 model +- Generate intelligent responses using OpenAI's GPT-4o-mini +- Speak responses back using Deepgram's Aura-2 voices +- Handle natural turn-taking and interruptions + +The end result is a conversational AI you can talk to just like a human—with low latency and natural voice quality. 
+ +## Prerequisites + +Before we start, make sure you have: +- Python 3.10 or later +- A Deepgram account ([sign up free](https://console.deepgram.com/)) +- A LiveKit Cloud account ([get started](https://cloud.livekit.io/)) +- An OpenAI account with API access + +## Project Setup + +Let's start by creating our project structure: + +```bash +mkdir livekit-voice-agent +cd livekit-voice-agent +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate +``` + +Create a `requirements.txt` file with our dependencies: + +``` +# LiveKit Agents framework +livekit-agents>=1.5.0 + +# Deepgram plugins for STT and TTS +livekit-plugins-deepgram>=1.5.0 + +# OpenAI plugin for LLM +livekit-plugins-openai>=1.5.0 + +# Silero plugin for Voice Activity Detection +livekit-plugins-silero>=1.5.0 + +# Additional dependencies +python-dotenv>=1.0.0 +``` + +Install the dependencies: + +```bash +pip install -r requirements.txt +``` + +## Understanding the Architecture + +Before we write code, let's understand how LiveKit Agents works: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ LiveKit Room │ +│ │ +│ ┌──────────┐ Audio ┌──────────────────────────────────┐ │ +│ │ User │ ──────────► │ Voice Agent │ │ +│ │ (Browser)│ │ │ │ +│ │ │ ◄────────── │ VAD → STT → LLM → TTS │ │ +│ └──────────┘ Audio │ (Deepgram) (Deepgram) │ │ +│ └──────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +1. **User connects** to a LiveKit room via browser +2. **Audio streams** to the agent in real-time +3. **VAD** (Voice Activity Detection) detects when the user speaks +4. **STT** (Deepgram Nova-3) transcribes speech to text +5. **LLM** (OpenAI) generates a response +6. **TTS** (Deepgram Aura-2) synthesizes speech +7. **Audio streams** back to the user + +The LiveKit Agents framework handles all the complexity of managing these streams, detecting turns, handling interruptions, and more. 
+ +## Creating the Agent + +Create `src/agent.py`: + +```python +""" +LiveKit Voice Agent with Deepgram STT and TTS. +""" + +import logging +from livekit.agents import Agent, AgentSession, JobContext +from livekit.agents.cli import run_app +from livekit.plugins import deepgram, openai, silero + +logger = logging.getLogger("voice-agent") + + +async def entrypoint(ctx: JobContext) -> None: + """Main entrypoint for the agent session.""" + + logger.info(f"Agent joining room: {ctx.room.name}") + + # Connect to the LiveKit room + await ctx.connect() + + # Initialize the Deepgram STT plugin + deepgram_stt = deepgram.STT( + model="nova-3", + language="en-US", + interim_results=True, + punctuate=True, + filler_words=True, + endpointing_ms=25, + ) + + # Initialize the Deepgram TTS plugin + deepgram_tts = deepgram.TTS( + model="aura-2-andromeda-en", + sample_rate=24000, + ) + + # Initialize the OpenAI LLM + openai_llm = openai.LLM( + model="gpt-4o-mini", + temperature=0.7, + ) + + # Initialize Voice Activity Detection + vad = silero.VAD.load() + + # Create the agent with instructions + agent = Agent( + instructions="""You are a helpful voice assistant powered by Deepgram and LiveKit. + +Your role is to: +- Have natural, friendly conversations with users +- Answer questions clearly and concisely +- Be helpful and informative +- Keep responses brief since this is a voice conversation + +Remember: You're speaking, not writing. 
Keep your responses conversational.""", + ) + + # Create and start the agent session + session = AgentSession( + stt=deepgram_stt, + tts=deepgram_tts, + llm=openai_llm, + vad=vad, + ) + + await session.start(room=ctx.room, agent=agent) + logger.info("Agent session started successfully") + + +if __name__ == "__main__": + from livekit.agents import AgentServer + + server = AgentServer() + + @server.rtc_session + async def _entrypoint(ctx: JobContext) -> None: + await entrypoint(ctx) + + run_app(server) +``` + +Let's break down the key components: + +### Deepgram STT Configuration + +```python +deepgram_stt = deepgram.STT( + model="nova-3", # Latest and most accurate model + language="en-US", # Primary language + interim_results=True, # Get partial transcripts as user speaks + punctuate=True, # Automatic punctuation + filler_words=True, # Include "um", "uh" for natural turn detection + endpointing_ms=25, # Quick response to silence +) +``` + +Nova-3 is Deepgram's latest model with the best accuracy. The `interim_results=True` setting is crucial for responsive agents—it lets the agent start processing before the user finishes speaking. + +### Deepgram TTS Configuration + +```python +deepgram_tts = deepgram.TTS( + model="aura-2-andromeda-en", # Natural female voice + sample_rate=24000, # High quality audio +) +``` + +Aura-2 is Deepgram's latest text-to-speech model, offering natural-sounding voices with low latency. The `andromeda-en` voice is conversational and works well for assistant use cases. + +### Agent Instructions + +The agent's instructions shape its personality and behavior. 
For voice agents, it's important to emphasize: +- **Brevity**: Long responses are tiresome to listen to +- **Conversational tone**: Written text sounds robotic when spoken +- **Clarity**: Avoid complex sentence structures + +## Environment Configuration + +Create a `.env` file (copy from `.env.example`): + +```bash +# Deepgram API key +DEEPGRAM_API_KEY=your_deepgram_api_key + +# LiveKit credentials +LIVEKIT_URL=wss://your-app.livekit.cloud +LIVEKIT_API_KEY=your_livekit_api_key +LIVEKIT_API_SECRET=your_livekit_api_secret + +# OpenAI API key +OPENAI_API_KEY=your_openai_api_key +``` + +### Getting Your API Keys + +**Deepgram:** +1. Go to [console.deepgram.com](https://console.deepgram.com/) +2. Create a new API key with STT and TTS permissions + +**LiveKit:** +1. Go to [cloud.livekit.io](https://cloud.livekit.io/) +2. Create a new project +3. Copy the URL, API Key, and API Secret from the Settings page + +**OpenAI:** +1. Go to [platform.openai.com](https://platform.openai.com/) +2. Create a new API key + +## Running the Agent + +Start the agent in development mode: + +```bash +python src/agent.py dev +``` + +You'll see output like: + +``` +INFO - Starting agent in development mode +INFO - Agent server listening on port 8081 +INFO - Registered with LiveKit server +``` + +Now open the [LiveKit Agents Playground](https://agents-playground.livekit.io/): + +1. Enter your LiveKit URL +2. Click "Connect" +3. Grant microphone access +4. Start talking! + +The agent will transcribe your speech, generate a response, and speak back to you. + +## Understanding the Agent Pipeline + +When you speak, here's what happens: + +1. **Audio capture**: LiveKit captures your microphone audio +2. **VAD processing**: Silero VAD detects speech boundaries +3. **Streaming STT**: Deepgram transcribes audio in real-time +4. **Turn detection**: Agent decides when you've finished speaking +5. **LLM inference**: OpenAI generates a response +6. **Streaming TTS**: Deepgram synthesizes speech +7. 
**Audio playback**: LiveKit plays audio to your speakers + +The framework handles all of this automatically, including: +- **Interruption handling**: If you speak while the agent is talking, it stops +- **Turn management**: Natural conversation flow +- **Error recovery**: Automatic retries on transient failures + +## Customizing the Agent + +### Changing the Voice + +Deepgram offers several Aura-2 voices: + +```python +deepgram_tts = deepgram.TTS( + model="aura-2-orion-en", # Male voice + # model="aura-2-luna-en", # Female, more casual + # model="aura-2-stella-en", # British female +) +``` + +### Adjusting STT Sensitivity + +For noisy environments, adjust endpointing: + +```python +deepgram_stt = deepgram.STT( + model="nova-3", + endpointing_ms=100, # Wait longer before ending turn + filler_words=False, # Ignore filler words +) +``` + +### Multi-language Support + +Deepgram supports 30+ languages: + +```python +deepgram_stt = deepgram.STT( + model="nova-3", + language="es", # Spanish + # language="fr", # French + # language="de", # German +) +``` + +### Adding Tools + +Give your agent capabilities: + +```python +from livekit.agents import llm + +@llm.function_tool +async def get_weather(location: str) -> str: + """Get the current weather for a location.""" + # Implementation here + return f"The weather in {location} is sunny and 72°F" + +agent = Agent( + instructions="You are a helpful assistant that can check the weather.", + tools=[get_weather], +) +``` + +## Testing Your Agent + +Create `tests/test_deepgram_connection.py` to verify your setup: + +```python +import asyncio +import os +import aiohttp +from livekit.plugins import deepgram + +async def test_tts(): + """Test Deepgram TTS.""" + async with aiohttp.ClientSession() as session: + tts = deepgram.TTS( + model="aura-2-andromeda-en", + http_session=session, + ) + + audio_bytes = 0 + async for chunk in tts.synthesize("Hello, this is a test."): + audio_bytes += len(chunk.frame.data) + + print(f"✓ TTS generated 
{audio_bytes} bytes of audio") + +if __name__ == "__main__": + asyncio.run(test_tts()) +``` + +Run the test: + +```bash +python tests/test_deepgram_connection.py +``` + +## Production Deployment + +For production, run the agent in server mode: + +```bash +python src/agent.py start +``` + +This registers the agent with your LiveKit server. When users join rooms, LiveKit automatically dispatches agents to serve them. + +### Scaling Considerations + +- **Multiple workers**: Run multiple agent processes for high availability +- **Load balancing**: LiveKit handles distribution automatically +- **Monitoring**: Use the built-in Prometheus metrics endpoint + +## What's Next + +Now that you have a basic voice agent working, consider: + +1. **Add memory**: Store conversation history for context +2. **Implement tools**: Give the agent capabilities like web search, calendar access +3. **Custom wake words**: Trigger the agent with specific phrases +4. **Sentiment analysis**: Adjust responses based on user emotion +5. **Multi-modal**: Add video understanding using the video sampler + +## Resources + +- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) +- [Deepgram API Reference](https://developers.deepgram.com/reference/) +- [LiveKit Agents GitHub](https://github.com/livekit/agents) +- [Deepgram Nova-3 Announcement](https://deepgram.com/learn/nova-3-speech-to-text-api) +- [Deepgram Aura-2 TTS](https://deepgram.com/learn/aura-2-text-to-speech-api) + +## Conclusion + +You've built a fully functional voice AI agent that combines: +- LiveKit's real-time infrastructure +- Deepgram's industry-leading speech AI +- OpenAI's conversational intelligence + +The LiveKit Agents framework handles the complexity of real-time voice applications, letting you focus on building great experiences. Deepgram's low-latency STT and natural TTS make conversations feel fluid and natural. + +Happy building! 
🎙️ diff --git a/examples/540-livekit-voice-agent-python/README.md b/examples/540-livekit-voice-agent-python/README.md new file mode 100644 index 0000000..69f5dae --- /dev/null +++ b/examples/540-livekit-voice-agent-python/README.md @@ -0,0 +1,160 @@ +# LiveKit Voice Agent with Deepgram STT/TTS + +A real-time voice AI agent using LiveKit Agents framework with Deepgram for speech-to-text and text-to-speech. + +![Screenshot](./screenshot.png) + +## What This Does + +This example creates a voice-based AI assistant that: +- Joins a LiveKit room and listens for user speech +- Uses **Deepgram Nova-3** for real-time speech recognition +- Processes user input with **OpenAI GPT-4o-mini** (configurable) +- Responds with natural voice using **Deepgram Aura-2** text-to-speech +- Handles turn-taking and interruptions automatically + +## Prerequisites + +- Python 3.10+ +- A [Deepgram account](https://console.deepgram.com/) with API key +- A [LiveKit Cloud account](https://cloud.livekit.io/) or self-hosted LiveKit server +- An [OpenAI account](https://platform.openai.com/) with API key + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `DEEPGRAM_API_KEY` | Your Deepgram API key for STT and TTS | +| `LIVEKIT_URL` | LiveKit server WebSocket URL (e.g., `wss://your-app.livekit.cloud`) | +| `LIVEKIT_API_KEY` | LiveKit API key | +| `LIVEKIT_API_SECRET` | LiveKit API secret | +| `OPENAI_API_KEY` | OpenAI API key for the LLM | + +## Quick Start + +1. **Clone and navigate to the example:** + ```bash + cd examples/540-livekit-voice-agent-python + ``` + +2. **Create a virtual environment:** + ```bash + python -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + ``` + +3. **Install dependencies:** + ```bash + pip install -r requirements.txt + ``` + +4. **Configure environment variables:** + ```bash + cp .env.example .env + # Edit .env with your API keys + ``` + +5. 
**Run the agent in development mode:** + ```bash + python src/agent.py dev + ``` + +6. **Connect to the agent:** + - Open the [LiveKit Playground](https://agents-playground.livekit.io/) + - Enter your LiveKit URL + - Click "Connect" to join the same room as the agent + - Start speaking! + +## How It Works + +The agent uses the LiveKit Agents framework pipeline: + +``` +User Speech → Deepgram STT → OpenAI LLM → Deepgram TTS → Audio Output + ↑ ↓ + (Nova-3) (Aura-2) +``` + +1. **Voice Activity Detection (VAD)**: Silero VAD detects when the user starts/stops speaking +2. **Speech-to-Text**: Deepgram Nova-3 transcribes user speech in real-time +3. **LLM Processing**: OpenAI generates a response based on conversation history +4. **Text-to-Speech**: Deepgram Aura-2 synthesizes natural-sounding audio +5. **Turn Management**: LiveKit Agents handles interruptions and turn-taking + +## Running in Production + +For production deployment: + +```bash +python src/agent.py start +``` + +This registers the agent with your LiveKit server so it can be dispatched to rooms automatically. + +## Configuration Options + +### Deepgram STT Options + +```python +deepgram.STT( + model="nova-3", # Latest Deepgram model + language="en-US", # Language code + interim_results=True, # Enable partial transcripts + punctuate=True, # Add punctuation + filler_words=True, # Include um, uh, etc. 
+ endpointing_ms=25, # Silence detection threshold +) +``` + +### Deepgram TTS Options + +```python +deepgram.TTS( + model="aura-2-andromeda-en", # Voice model + sample_rate=24000, # Audio sample rate +) +``` + +### Available Deepgram TTS Voices + +- `aura-2-andromeda-en` - Female, American English +- `aura-2-orion-en` - Male, American English +- `aura-2-luna-en` - Female, American English (conversational) +- `aura-2-stella-en` - Female, British English +- `aura-2-athena-en` - Female, British English + +## Testing + +Run the test suite: + +```bash +python tests/run_tests.py +``` + +Tests verify: +- Deepgram STT connection and transcription +- Deepgram TTS connection and audio generation +- Plugin initialization +- Agent module structure + +## Troubleshooting + +### "Connection error" on startup +- Verify your `DEEPGRAM_API_KEY` is correct +- Check your internet connection +- Ensure the API key has STT and TTS permissions + +### Agent doesn't respond to speech +- Check that your microphone is working in the LiveKit Playground +- Verify the VAD is detecting speech (check console logs) +- Ensure `OPENAI_API_KEY` is set correctly + +### Audio quality issues +- Try adjusting `sample_rate` in TTS settings +- Check your network latency to LiveKit server + +## Resources + +- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) +- [Deepgram Documentation](https://developers.deepgram.com/) +- [LiveKit Deepgram Plugin](https://github.com/livekit/agents/tree/main/livekit-plugins/livekit-plugins-deepgram) diff --git a/examples/540-livekit-voice-agent-python/requirements.txt b/examples/540-livekit-voice-agent-python/requirements.txt new file mode 100644 index 0000000..f5c4ecf --- /dev/null +++ b/examples/540-livekit-voice-agent-python/requirements.txt @@ -0,0 +1,14 @@ +# LiveKit Agents framework +livekit-agents>=1.5.0 + +# Deepgram plugins for STT and TTS +livekit-plugins-deepgram>=1.5.0 + +# OpenAI plugin for LLM +livekit-plugins-openai>=1.5.0 + +# Silero plugin for 
Voice Activity Detection +livekit-plugins-silero>=1.5.0 + +# Additional dependencies +python-dotenv>=1.0.0 diff --git a/examples/540-livekit-voice-agent-python/screenshot.png b/examples/540-livekit-voice-agent-python/screenshot.png new file mode 100644 index 0000000..1782dd4 Binary files /dev/null and b/examples/540-livekit-voice-agent-python/screenshot.png differ diff --git a/examples/540-livekit-voice-agent-python/src/__init__.py b/examples/540-livekit-voice-agent-python/src/__init__.py new file mode 100644 index 0000000..e9c3044 --- /dev/null +++ b/examples/540-livekit-voice-agent-python/src/__init__.py @@ -0,0 +1 @@ +# LiveKit Voice Agent with Deepgram diff --git a/examples/540-livekit-voice-agent-python/src/agent.py b/examples/540-livekit-voice-agent-python/src/agent.py new file mode 100644 index 0000000..d514032 --- /dev/null +++ b/examples/540-livekit-voice-agent-python/src/agent.py @@ -0,0 +1,102 @@ +""" +LiveKit Voice Agent with Deepgram STT and TTS. + +This example demonstrates how to build a voice-based AI agent using: +- LiveKit Agents framework for real-time communication +- Deepgram for speech-to-text (STT) using Nova-3 +- Deepgram for text-to-speech (TTS) using Aura-2 +- OpenAI for the LLM (GPT-4o-mini) + +The agent joins a LiveKit room and can have a real-time voice conversation +with users who connect to the same room. 
async def entrypoint(ctx: JobContext) -> None:
    """Join the job's LiveKit room and run a voice session in it.

    The session is assembled from Silero VAD, Deepgram STT ("nova-3"),
    an OpenAI LLM ("gpt-4o-mini") and Deepgram TTS ("aura-2-andromeda-en").

    Args:
        ctx: Job context supplying the room to connect to.
    """
    logger.info(f"Agent joining room: {ctx.room.name}")

    await ctx.connect()

    # System prompt: steer the LLM towards short, spoken-style answers.
    assistant_prompt = """You are a helpful voice assistant powered by Deepgram and LiveKit.

Your role is to:
- Have natural, friendly conversations with users
- Answer questions clearly and concisely
- Be helpful and informative
- Keep responses brief since this is a voice conversation

Remember: You're speaking, not writing. Keep your responses conversational and
avoid overly long explanations. If you need to explain something complex,
break it into digestible parts and check if the user wants more detail."""

    voice_agent = Agent(instructions=assistant_prompt)

    # Keyword arguments are evaluated top to bottom, so the plugins are still
    # constructed in STT -> TTS -> LLM -> VAD order.
    voice_session = AgentSession(
        stt=deepgram.STT(
            model="nova-3",
            language="en-US",
            interim_results=True,
            punctuate=True,
            filler_words=True,
            endpointing_ms=25,
        ),
        tts=deepgram.TTS(
            model="aura-2-andromeda-en",
            sample_rate=24000,
        ),
        llm=openai.LLM(
            model="gpt-4o-mini",
            temperature=0.7,
        ),
        vad=silero.VAD.load(),
    )

    await voice_session.start(room=ctx.room, agent=voice_agent)

    logger.info("Agent session started successfully")
def run_test(test_file: str) -> int:
    """Run one test script in a child interpreter and return its exit code.

    Args:
        test_file: Path of the test script to execute.

    Returns:
        The return code reported by ``subprocess.run`` for the child process.
    """
    print(f"\n{'='*60}")
    print(f"Running: {test_file}")
    print('='*60 + "\n")

    result = subprocess.run(
        [sys.executable, test_file],
        cwd=Path(__file__).parent,
        env=os.environ.copy(),
    )

    return result.returncode


def main() -> int:
    """Run every known test script and return the aggregate exit code.

    Each script signals its outcome through its exit code: 0 is a pass and
    2 is a skip (missing credentials). Every other code, including crashes
    and signal terminations, is treated as a failure so that a broken
    script can never be reported as a successful run.

    Returns:
        1 if any script failed, 2 if every script that ran was skipped (or
        no script was found), otherwise 0.
    """
    test_dir = Path(__file__).parent

    # Test scripts to run, in order
    test_files = [
        "test_deepgram_connection.py",
        "test_livekit_connection.py",
        "test_integration.py",
    ]

    results = {}
    for test_file in test_files:
        test_path = test_dir / test_file
        if not test_path.exists():
            # Tolerated, but no longer silent.
            print(f"Skipping missing test file: {test_file}")
            continue
        results[test_file] = run_test(str(test_path))

    # Anything that is neither PASS (0) nor SKIP (2) counts as a failure.
    has_failure = any(code not in (0, 2) for code in results.values())
    all_skipped = all(code == 2 for code in results.values())

    # Summary
    print("\n" + "="*60)
    print("TEST SUMMARY")
    print("="*60)

    labels = {0: "PASS ✓", 1: "FAIL ✗", 2: "SKIP ⊘"}
    for test_file, code in results.items():
        status = labels.get(code, f"FAIL ✗ (exit code {code})")
        print(f" {test_file}: {status}")

    print()

    if has_failure:
        print("Some tests failed!")
        return 1
    if not results:
        print("No test files were found")
        return 2
    if all_skipped:
        print("All tests skipped due to missing credentials")
        return 2

    print("All tests passed!")
    return 0
async def test_deepgram_stt() -> bool:
    """Round-trip a phrase through Deepgram TTS and back through Deepgram STT.

    Audio for a fixed phrase is synthesized with the TTS plugin, pushed into
    a streaming STT session, and the final transcripts are collected. The
    check passes when any non-empty transcript comes back; a low word
    overlap with the original phrase only produces a warning.

    Returns:
        True if a non-empty transcript was produced, False on an empty
        transcript, missing audio, or any exception.
    """
    from livekit.plugins import deepgram
    from livekit.agents.stt import SpeechEventType

    print("Testing Deepgram STT...")

    async with aiohttp.ClientSession() as session:
        try:
            stt = deepgram.STT(
                model="nova-3",
                language="en-US",
                punctuate=True,
                http_session=session,
            )

            # First generate some audio using TTS
            tts = deepgram.TTS(
                model="aura-2-andromeda-en",
                sample_rate=16000,  # Match STT sample rate
                http_session=session,
            )

            test_phrase = "The quick brown fox jumps over the lazy dog."

            # Collect audio frames from TTS
            audio_frames = [chunk.frame async for chunk in tts.synthesize(test_phrase)]

            if not audio_frames:
                print(" ✗ No audio frames generated for STT test")
                return False

            print(f" - Generated {len(audio_frames)} audio frames for transcription")

            # Create STT stream
            stream = stt.stream()
            transcript_parts = []
            try:
                # Push audio frames to the stream
                for frame in audio_frames:
                    stream.push_frame(frame)

                # Signal end of input
                stream.end_input()

                # Collect transcription results
                async for event in stream:
                    if event.type == SpeechEventType.FINAL_TRANSCRIPT and event.alternatives:
                        transcript_parts.append(event.alternatives[0].text)
            finally:
                # Always release the streaming connection, even when pushing
                # frames or reading events raised.
                await stream.aclose()

            final_transcript = " ".join(transcript_parts).strip()

            if final_transcript:
                print(f" ✓ STT transcribed: '{final_transcript}'")

                # Check if transcription is reasonably accurate
                original_words = set(test_phrase.lower().replace(".", "").split())
                transcribed_words = set(final_transcript.lower().replace(".", "").split())

                common_words = original_words.intersection(transcribed_words)

                if len(common_words) >= 3:
                    print(f" ✓ Transcription accuracy check passed ({len(common_words)} matching words)")
                    return True
                else:
                    print(f" ⚠ Transcription may be inaccurate (only {len(common_words)} matching words)")
                    return True  # Still pass - Deepgram is working
            else:
                print(" ✗ STT returned empty transcription")
                return False

        except Exception as e:
            print(f" ✗ STT error: {e}")
            import traceback
            traceback.print_exc()
            return False
async def main() -> int:
    """Execute the Deepgram checks in order and map the outcome to an exit code.

    Returns:
        2 when credentials are missing, 0 when every check passes,
        otherwise 1.
    """
    divider = "=" * 60
    print(divider)
    print("Deepgram Connection Tests")
    print(divider)
    print()

    # Nothing can run without an API key; report "skipped".
    if not check_credentials():
        return 2

    # The awaits inside the list display run sequentially, in this order.
    outcomes = [
        await test_plugin_initialization(),
        await test_deepgram_tts(),
        await test_deepgram_stt(),
    ]

    print()
    print(divider)

    if not all(outcomes):
        print(f"Tests: {sum(outcomes)}/{len(outcomes)} passed")
        return 1

    print("All tests passed! ✓")
    return 0
def check_credentials() -> bool:
    """Report whether a non-empty DEEPGRAM_API_KEY is present in the environment.

    Returns:
        True when the variable is set to a non-empty value; otherwise a SKIP
        notice is printed and False is returned.
    """
    if os.environ.get("DEEPGRAM_API_KEY"):
        return True

    print("SKIP: DEEPGRAM_API_KEY not set")
    return False
async def test_agent_module_import() -> bool:
    """Import the example's ``agent`` module and confirm it exposes ``entrypoint``.

    The sibling ``src`` directory is prepended to ``sys.path`` so that the
    module can be imported by its bare name.

    Returns:
        True if the module imports and has an ``entrypoint`` attribute,
        False if the attribute is missing or the import raises.
    """
    print("Testing agent module...")

    try:
        # Make ../src importable
        tests_dir = os.path.dirname(__file__)
        src_dir = os.path.join(tests_dir, '..', 'src')
        sys.path.insert(0, src_dir)

        # Import should not fail
        import agent

        if not hasattr(agent, 'entrypoint'):
            print(" ✗ Agent module missing entrypoint")
            return False

        print(" ✓ Agent module loaded, entrypoint found")
        return True

    except Exception as e:
        print(f" ✗ Agent module error: {e}")
        import traceback
        traceback.print_exc()
        return False
✓") + return 0 + else: + passed = sum(results) + total = len(results) + print(f"Tests: {passed}/{total} passed") + return 1 + + +if __name__ == "__main__": + exit_code = asyncio.run(main()) + sys.exit(exit_code) diff --git a/examples/540-livekit-voice-agent-python/tests/test_livekit_connection.py b/examples/540-livekit-voice-agent-python/tests/test_livekit_connection.py new file mode 100644 index 0000000..d34515e --- /dev/null +++ b/examples/540-livekit-voice-agent-python/tests/test_livekit_connection.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +""" +Test LiveKit server connection. + +This test verifies that: +1. LiveKit credentials are valid +2. Can connect to the LiveKit server + +Exit codes: +- 0: All tests passed +- 1: Test failed +- 2: Missing credentials (skip) +""" + +import asyncio +import os +import sys + + +def check_credentials() -> bool: + """Check if required credentials are available.""" + missing = [] + + if not os.environ.get("LIVEKIT_URL"): + missing.append("LIVEKIT_URL") + if not os.environ.get("LIVEKIT_API_KEY"): + missing.append("LIVEKIT_API_KEY") + if not os.environ.get("LIVEKIT_API_SECRET"): + missing.append("LIVEKIT_API_SECRET") + + if missing: + print(f"SKIP: Missing LiveKit credentials: {', '.join(missing)}") + return False + return True + + +async def test_livekit_api() -> bool: + """Test LiveKit API connection by listing rooms.""" + from livekit import api + + print("Testing LiveKit API connection...") + + try: + livekit_api = api.LiveKitAPI( + url=os.environ.get("LIVEKIT_URL"), + api_key=os.environ.get("LIVEKIT_API_KEY"), + api_secret=os.environ.get("LIVEKIT_API_SECRET"), + ) + + # List rooms to verify connection + rooms = await livekit_api.room.list_rooms(api.ListRoomsRequest()) + + print(f" ✓ Connected to LiveKit server") + print(f" - Active rooms: {len(rooms.rooms)}") + + await livekit_api.aclose() + return True + + except Exception as e: + print(f" ✗ LiveKit API error: {e}") + return False + + +async def test_token_generation() -> 
bool: + """Test LiveKit access token generation.""" + from livekit import api + + print("Testing token generation...") + + try: + api_key = os.environ.get("LIVEKIT_API_KEY") + api_secret = os.environ.get("LIVEKIT_API_SECRET") + + # Create an access token + token = api.AccessToken(api_key, api_secret) + token.with_identity("test-agent") + token.with_name("Test Agent") + token.with_grants(api.VideoGrants( + room_join=True, + room="test-room", + can_publish=True, + can_subscribe=True, + )) + + jwt = token.to_jwt() + + if jwt and len(jwt) > 100: + print(f" ✓ Token generated successfully ({len(jwt)} chars)") + return True + else: + print(" ✗ Token generation produced invalid token") + return False + + except Exception as e: + print(f" ✗ Token generation error: {e}") + return False + + +async def main() -> int: + """Run all tests.""" + print("=" * 60) + print("LiveKit Connection Tests") + print("=" * 60) + print() + + # Check credentials + if not check_credentials(): + return 2 + + results = [] + + # Run tests + results.append(await test_token_generation()) + results.append(await test_livekit_api()) + + print() + print("=" * 60) + + if all(results): + print("All tests passed! ✓") + return 0 + else: + passed = sum(results) + total = len(results) + print(f"Tests: {passed}/{total} passed") + return 1 + + +if __name__ == "__main__": + exit_code = asyncio.run(main()) + sys.exit(exit_code)