diff --git a/examples/310-crewai-voice-agents-python/.env.example b/examples/310-crewai-voice-agents-python/.env.example new file mode 100644 index 0000000..42779e0 --- /dev/null +++ b/examples/310-crewai-voice-agents-python/.env.example @@ -0,0 +1,7 @@ +# Deepgram — used for STT (nova-3) and TTS (aura-2) +# https://console.deepgram.com/ +DEEPGRAM_API_KEY= + +# OpenAI — used as the LLM backend for CrewAI agents +# https://platform.openai.com/api-keys +OPENAI_API_KEY= diff --git a/examples/310-crewai-voice-agents-python/README.md b/examples/310-crewai-voice-agents-python/README.md new file mode 100644 index 0000000..42f7a97 --- /dev/null +++ b/examples/310-crewai-voice-agents-python/README.md @@ -0,0 +1,81 @@ +# CrewAI Voice-Enabled Multi-Agent System with Deepgram + +Build a multi-agent voice pipeline using CrewAI and Deepgram. A crew of three AI agents collaborates to process spoken audio: one transcribes speech with Deepgram STT, one analyses the content, and one delivers the response as spoken audio via Deepgram TTS. + +## What you'll build + +A Python application where a CrewAI crew processes audio end-to-end: a Voice Listener agent transcribes audio using Deepgram nova-3, a Research Analyst agent extracts key insights from the transcript, and a Voice Speaker agent synthesises the analysis into spoken audio using Deepgram aura-2. The agents coordinate sequentially, passing context from one to the next. + +## Prerequisites + +- Python 3.10+ +- Deepgram account — [get a free API key](https://console.deepgram.com/) +- OpenAI account — [get an API key](https://platform.openai.com/api-keys) + +## Environment variables + +| Variable | Where to find it | +|----------|-----------------| +| `DEEPGRAM_API_KEY` | [Deepgram console](https://console.deepgram.com/) | +| `OPENAI_API_KEY` | [OpenAI dashboard](https://platform.openai.com/api-keys) | + +Copy `.env.example` to `.env` and fill in your values. 
+ +## Install and run + +```bash +pip install -r requirements.txt + +# Run with default sample audio (NASA spacewalk recording) +python src/crew.py + +# Run with your own audio file or URL +python src/crew.py path/to/audio.wav +python src/crew.py https://example.com/audio.mp3 +``` + +## Key parameters + +| Parameter | Value | Description | +|-----------|-------|-------------| +| `model` (STT) | `nova-3` | Deepgram's most accurate speech recognition model | +| `model` (TTS) | `aura-2-asteria-en` | Natural conversational voice for spoken output | +| `smart_format` | `True` | Adds punctuation, capitalisation, and number formatting | +| `process` | `Process.sequential` | Agents execute in order — listener, researcher, speaker | + +## How it works + +1. **Voice Listener agent** receives an audio source (URL or file path) and calls the `transcribe_audio` tool, which uses the Deepgram Python SDK to transcribe with nova-3 +2. **Research Analyst agent** receives the transcript and produces a concise, voice-friendly summary of the key points and insights +3. **Voice Speaker agent** takes the summary, cleans it for natural speech, and calls the `speak_text` tool to generate a WAV file via Deepgram TTS (aura-2) +4. 
The crew runs sequentially via `Process.sequential` — each task's output is automatically passed as context to the next agent + +## Architecture + +``` +Audio Input + | + v +[Voice Listener Agent] + | transcribe_audio tool -> Deepgram STT (nova-3) + v +[Research Analyst Agent] + | LLM analysis (OpenAI LLM) + v +[Voice Speaker Agent] + | speak_text tool -> Deepgram TTS (aura-2) + v +Audio Output (WAV file) +``` + +## Related + +- [CrewAI documentation](https://docs.crewai.com/) +- [CrewAI GitHub](https://github.com/crewAIInc/crewAI) +- [Deepgram STT docs](https://developers.deepgram.com/docs/pre-recorded-audio) +- [Deepgram TTS docs](https://developers.deepgram.com/docs/text-to-speech) +- [Deepgram Python SDK](https://github.com/deepgram/deepgram-python-sdk) + +## Starter templates + +If you want a ready-to-run base for your own project, check the [deepgram-starters](https://github.com/orgs/deepgram-starters/repositories) org — there are starter repos for every language and every Deepgram product. diff --git a/examples/310-crewai-voice-agents-python/requirements.txt b/examples/310-crewai-voice-agents-python/requirements.txt new file mode 100644 index 0000000..e4774bf --- /dev/null +++ b/examples/310-crewai-voice-agents-python/requirements.txt @@ -0,0 +1,3 @@ +crewai[tools]>=1.12.0 +deepgram-sdk==v6.1.1 +python-dotenv>=1.0.0 diff --git a/examples/310-crewai-voice-agents-python/src/crew.py b/examples/310-crewai-voice-agents-python/src/crew.py new file mode 100644 index 0000000..2011a0c --- /dev/null +++ b/examples/310-crewai-voice-agents-python/src/crew.py @@ -0,0 +1,230 @@ +"""CrewAI multi-agent voice crew with Deepgram STT & TTS. + +A three-agent crew that demonstrates voice-in/voice-out multi-agent +coordination: Deepgram transcribes spoken audio, CrewAI agents process +and research the request, and Deepgram synthesises a spoken response. 
+ +Pipeline: + audio file -> Deepgram STT (nova-3) -> CrewAI crew -> Deepgram TTS (aura-2) -> audio file + +Usage: + python src/crew.py # uses default sample audio + python src/crew.py path/to/audio.wav # transcribe your own file +""" + +import os +import sys +import tempfile +from pathlib import Path + +from dotenv import load_dotenv + +load_dotenv() + +if not os.environ.get("DEEPGRAM_API_KEY"): + print("Error: DEEPGRAM_API_KEY is not set.", file=sys.stderr) + print("Get a free key at https://console.deepgram.com/", file=sys.stderr) + sys.exit(1) + +if not os.environ.get("OPENAI_API_KEY"): + print("Error: OPENAI_API_KEY is not set.", file=sys.stderr) + print("Get a key at https://platform.openai.com/api-keys", file=sys.stderr) + sys.exit(1) + +from crewai import Agent, Crew, Process, Task +from crewai.tools import tool +from deepgram import DeepgramClient + +AUDIO_URL = "https://dpgr.am/spacewalk.wav" + + +def get_deepgram_client() -> DeepgramClient: + return DeepgramClient() + + +# ── Deepgram STT tool ──────────────────────────────────────────────────────── + +@tool +def transcribe_audio(audio_source: str) -> str: + """Transcribe an audio file or URL using Deepgram speech-to-text. + + Accepts a local file path or a public URL. Returns the full transcript. 
+ """ + client = get_deepgram_client() + + if audio_source.startswith(("http://", "https://")): + # Deepgram fetches the URL server-side — no local download needed + response = client.listen.v1.media.transcribe_url( + url=audio_source, + model="nova-3", + smart_format=True, + tag="deepgram-examples", + ) + else: + audio_bytes = Path(audio_source).read_bytes() + response = client.listen.v1.media.transcribe_file( + audio_bytes, + model="nova-3", + smart_format=True, + tag="deepgram-examples", + ) + + # response.results.channels[0].alternatives[0].transcript + transcript = response.results.channels[0].alternatives[0].transcript + confidence = response.results.channels[0].alternatives[0].confidence + + return ( + f"Transcript (confidence {confidence:.0%}):\n\n{transcript}" + ) + + +# ── Deepgram TTS tool ──────────────────────────────────────────────────────── + +@tool +def speak_text(text: str) -> str: + """Convert text to speech using Deepgram TTS and save as a WAV file. + + Returns the path to the generated audio file. + """ + client = get_deepgram_client() + + # aura-2-asteria-en is a natural conversational voice. 
+ # Other options: aura-2-zeus-en, aura-2-orpheus-en, aura-2-luna-en + # Full list: https://developers.deepgram.com/docs/tts-models + output_path = os.path.join(tempfile.gettempdir(), "crewai_voice_output.wav") + audio_iter = client.speak.v1.audio.generate( + text=text, + model="aura-2-asteria-en", + encoding="linear16", + sample_rate=24000, + container="wav", + tag="deepgram-examples", + ) + + with open(output_path, "wb") as f: + for chunk in audio_iter: + f.write(chunk) + + return f"Audio response saved to: {output_path}" + + +# ── CrewAI agents ──────────────────────────────────────────────────────────── + +def create_listener_agent() -> Agent: + """Agent that handles voice input via Deepgram STT.""" + return Agent( + role="Voice Listener", + goal="Accurately transcribe spoken audio into text using Deepgram", + backstory=( + "You are a specialist in audio transcription. Your job is to " + "take audio input and produce clean, accurate text using the " + "Deepgram speech-to-text tool. Always use the transcribe_audio " + "tool to process audio — never guess the content." + ), + tools=[transcribe_audio], + verbose=True, + ) + + +def create_researcher_agent() -> Agent: + """Agent that analyses the transcript and produces a response.""" + return Agent( + role="Research Analyst", + goal="Analyse transcribed speech and produce a clear, concise summary with key insights", + backstory=( + "You are an expert analyst who takes transcribed text and " + "extracts the key points, themes, and actionable insights. " + "Your summaries are concise, structured, and suitable to be " + "read aloud. Keep your response under 100 words so it works " + "well as spoken audio." 
+ ), + verbose=True, + ) + + +def create_speaker_agent() -> Agent: + """Agent that delivers the final response as spoken audio via Deepgram TTS.""" + return Agent( + role="Voice Speaker", + goal="Convert the research analysis into natural spoken audio using Deepgram TTS", + backstory=( + "You are a presentation specialist. Take the analysis from " + "the researcher and convert it into natural, spoken audio. " + "Before calling the speak_text tool, clean the text so it " + "sounds natural when spoken — remove markdown, bullet points, " + "and special formatting. Use the speak_text tool with the " + "cleaned text." + ), + tools=[speak_text], + verbose=True, + ) + + +# ── CrewAI tasks and crew ──────────────────────────────────────────────────── + +def build_crew(audio_source: str) -> Crew: + """Assemble a sequential crew: listen -> research -> speak.""" + listener = create_listener_agent() + researcher = create_researcher_agent() + speaker = create_speaker_agent() + + listen_task = Task( + description=( + f"Transcribe the audio from: {audio_source}\n" + "Use the transcribe_audio tool with this source. " + "Return the full transcript text." + ), + expected_output="The complete transcript of the audio.", + agent=listener, + ) + + research_task = Task( + description=( + "Analyse the transcript from the previous task. " + "Identify the main topic, key points, and any notable details. " + "Write a concise summary (under 100 words) that would sound " + "natural when read aloud." + ), + expected_output="A concise spoken-friendly summary of the transcript content.", + agent=researcher, + ) + + speak_task = Task( + description=( + "Take the research summary and convert it to spoken audio. " + "First, clean up the text to remove any markdown formatting, " + "bullet points, or special characters. Then use the speak_text " + "tool to generate the audio file. Return the file path." 
+ ), + expected_output="The file path to the generated audio response.", + agent=speaker, + ) + + return Crew( + agents=[listener, researcher, speaker], + tasks=[listen_task, research_task, speak_task], + # Sequential: each task's output flows to the next agent + process=Process.sequential, + verbose=True, + ) + + +def main(): + audio_source = sys.argv[1] if len(sys.argv) > 1 else AUDIO_URL + + print(f"Audio source: {audio_source}") + print("Building CrewAI voice crew...") + print(" Agent 1: Voice Listener (Deepgram STT)") + print(" Agent 2: Research Analyst (LLM)") + print(" Agent 3: Voice Speaker (Deepgram TTS)") + print() + + crew = build_crew(audio_source) + result = crew.kickoff() + + print("\n── Crew result ──") + print(result) + + +if __name__ == "__main__": + main() diff --git a/examples/310-crewai-voice-agents-python/tests/test_example.py b/examples/310-crewai-voice-agents-python/tests/test_example.py new file mode 100644 index 0000000..fe5e8cc --- /dev/null +++ b/examples/310-crewai-voice-agents-python/tests/test_example.py @@ -0,0 +1,112 @@ +import os +import sys +from pathlib import Path + +# ── Credential check ──────────────────────────────────────────────────────── +# Exit code convention across all examples in this repo: +# 0 = all tests passed +# 1 = real test failure (code bug, assertion error, unexpected API response) +# 2 = missing credentials (expected in CI until secrets are configured) +env_example = Path(__file__).parent.parent / ".env.example" +required = [ + line.split("=")[0].strip() + for line in env_example.read_text().splitlines() + if line and not line.startswith("#") and "=" in line and line[0].isupper() +] +missing = [k for k in required if not os.environ.get(k)] +if missing: + print(f"MISSING_CREDENTIALS: {','.join(missing)}", file=sys.stderr) + sys.exit(2) +# ──────────────────────────────────────────────────────────────────────────── + +from deepgram import DeepgramClient + +AUDIO_URL = "https://dpgr.am/spacewalk.wav" + + +def 
test_deepgram_stt(): + """Verify Deepgram STT transcribes audio correctly with nova-3.""" + client = DeepgramClient() + response = client.listen.v1.media.transcribe_url( + url=AUDIO_URL, + model="nova-3", + smart_format=True, + tag="deepgram-examples", + ) + transcript = response.results.channels[0].alternatives[0].transcript + assert len(transcript) > 10, "Transcript too short" + + duration = response.results.channels[0].alternatives[0].words[-1].end if response.results.channels[0].alternatives[0].words else 0 + chars_per_sec = len(transcript) / max(duration, 1) + assert 1 < chars_per_sec < 100, f"Transcript length not proportional to duration: {len(transcript)} chars / {duration:.1f}s" + + print("pass: Deepgram STT integration working") + print(f" Transcript preview: '{transcript[:80]}...'") + + +def test_deepgram_tts(): + """Verify Deepgram TTS generates audio bytes.""" + client = DeepgramClient() + import tempfile + + output_path = os.path.join(tempfile.gettempdir(), "test_tts_output.wav") + audio_iter = client.speak.v1.audio.generate( + text="Hello, this is a test of Deepgram text to speech.", + model="aura-2-asteria-en", + encoding="linear16", + sample_rate=24000, + container="wav", + tag="deepgram-examples", + ) + + with open(output_path, "wb") as f: + for chunk in audio_iter: + f.write(chunk) + + assert os.path.exists(output_path), "TTS output file was not created" + file_size = os.path.getsize(output_path) + assert file_size > 1000, f"TTS output too small ({file_size} bytes)" + + os.unlink(output_path) + print("pass: Deepgram TTS integration working") + print(f" Generated {file_size} bytes of audio") + + +def test_crewai_tools_importable(): + """Verify the CrewAI tools from our source are importable and well-formed.""" + sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + from crew import transcribe_audio, speak_text + + assert transcribe_audio.name == "transcribe_audio" + assert "transcribe" in transcribe_audio.description.lower() + + assert 
speak_text.name == "speak_text" + assert "speech" in speak_text.description.lower() or "text" in speak_text.description.lower() + + print("pass: CrewAI tool definitions valid") + print(f" Tools: {transcribe_audio.name}, {speak_text.name}") + + +def test_crewai_crew_builds(): + """Verify the crew can be assembled without errors.""" + sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + from crew import build_crew + + crew = build_crew(AUDIO_URL) + assert len(crew.agents) == 3, f"Expected 3 agents, got {len(crew.agents)}" + assert len(crew.tasks) == 3, f"Expected 3 tasks, got {len(crew.tasks)}" + + roles = [a.role for a in crew.agents] + assert "Voice Listener" in roles + assert "Research Analyst" in roles + assert "Voice Speaker" in roles + + print("pass: CrewAI crew builds correctly") + print(f" Agents: {', '.join(roles)}") + + +if __name__ == "__main__": + test_deepgram_stt() + test_deepgram_tts() + test_crewai_tools_importable() + test_crewai_crew_builds()