From b2cd9d32430c10b0b37e553f9b6db8e63b06c4a7 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 12:20:11 +0000 Subject: [PATCH 01/18] feat: add deepgram tts extension with voice-assistant integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add Deepgram TTS extension using WebSocket streaming API with Aura-2 voices. Wire into voice-assistant example as voice_assistant_deepgram_tts graph variant. Include progressive disclosure AI documentation. addresses PR #2128 review feedback: - remove raw config logging that exposed API keys - extract _finalize_request() helper (consolidate 5 duplicate patterns) - await client.start() instead of fire-and-forget asyncio.create_task() - add _reconnect_client() for immediate reconnect after errors - consume EVENT_TTS_FLUSH internally (don't leak to caller) - add early text validation in get() for empty/whitespace text - reduce websocket recv timeout from 10s to 5s - drop audio chunks received after cancellation flag is set - reconnect websocket after cancel for clean state on next request - change manifest.json sample_rate type from int64 to int32 test results: - standalone: 13/13 passed - guarder: 14/16 passed - test_flush: PASS (was failing — fixed cancel race condition) - test_invalid_text_handling: PASS (was skipped — fixed with text validation + timeout reduction) - test_interleaved_requests: FAIL — websocket state from previous request causes timeout on request 8/8. needs duplex websocket architecture (separate send/receive tasks) to fully resolve. - test_subtitle_alignment: FAIL — feature gap, deepgram tts api does not provide word-level timing data. config file not present. 
--- AGENTS.md | 26 + CLAUDE.md | 1 + .../voice-assistant/tenapp/manifest.json | 3 + .../voice-assistant/tenapp/property.json | 184 +++++ .../extension/deepgram_tts/README.md | 97 +++ .../extension/deepgram_tts/__init__.py | 6 + .../extension/deepgram_tts/addon.py | 20 + .../extension/deepgram_tts/config.py | 75 ++ .../extension/deepgram_tts/deepgram_tts.py | 298 ++++++++ .../extension/deepgram_tts/extension.py | 489 +++++++++++++ .../extension/deepgram_tts/manifest.json | 65 ++ .../extension/deepgram_tts/property.json | 11 + .../extension/deepgram_tts/requirements.txt | 1 + .../extension/deepgram_tts/tests/__init__.py | 5 + .../extension/deepgram_tts/tests/bin/start | 21 + .../property_basic_audio_setting1.json | 10 + .../property_basic_audio_setting2.json | 10 + .../tests/configs/property_dump.json | 10 + .../tests/configs/property_invalid.json | 5 + .../tests/configs/property_miss_required.json | 5 + .../extension/deepgram_tts/tests/conftest.py | 99 +++ .../deepgram_tts/tests/test_basic.py | 325 +++++++++ .../deepgram_tts/tests/test_error_msg.py | 174 +++++ .../deepgram_tts/tests/test_metrics.py | 135 ++++ .../deepgram_tts/tests/test_params.py | 157 +++++ .../deepgram_tts/tests/test_robustness.py | 277 ++++++++ docs/ai/L0_repo_card.md | 31 + docs/ai/L1/01_setup.md | 118 ++++ docs/ai/L1/02_architecture.md | 142 ++++ docs/ai/L1/03_code_map.md | 117 ++++ docs/ai/L1/04_conventions.md | 138 ++++ docs/ai/L1/05_workflows.md | 181 +++++ docs/ai/L1/06_interfaces.md | 150 ++++ docs/ai/L1/07_gotchas.md | 235 +++++++ docs/ai/L1/08_security.md | 88 +++ docs/ai/L1/deep_dives/_index.md | 9 + docs/ai/L1/deep_dives/deployment.md | 206 ++++++ .../ai/L1/deep_dives/extension_development.md | 653 ++++++++++++++++++ docs/ai/L1/deep_dives/graph_configuration.md | 410 +++++++++++ docs/ai/L1/deep_dives/server_architecture.md | 211 ++++++ docs/ai/L1/deep_dives/testing.md | 295 ++++++++ 41 files changed, 5493 insertions(+) create mode 100644 AGENTS.md create mode 100644 CLAUDE.md create 
mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/README.md create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/__init__.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/addon.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/config.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/manifest.json create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/property.json create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/requirements.txt create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/__init__.py create mode 100755 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/bin/start create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_basic_audio_setting1.json create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_basic_audio_setting2.json create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_dump.json create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_invalid.json create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_miss_required.json create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/conftest.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_error_msg.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_metrics.py create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py create mode 100644 
ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_robustness.py create mode 100644 docs/ai/L0_repo_card.md create mode 100644 docs/ai/L1/01_setup.md create mode 100644 docs/ai/L1/02_architecture.md create mode 100644 docs/ai/L1/03_code_map.md create mode 100644 docs/ai/L1/04_conventions.md create mode 100644 docs/ai/L1/05_workflows.md create mode 100644 docs/ai/L1/06_interfaces.md create mode 100644 docs/ai/L1/07_gotchas.md create mode 100644 docs/ai/L1/08_security.md create mode 100644 docs/ai/L1/deep_dives/_index.md create mode 100644 docs/ai/L1/deep_dives/deployment.md create mode 100644 docs/ai/L1/deep_dives/extension_development.md create mode 100644 docs/ai/L1/deep_dives/graph_configuration.md create mode 100644 docs/ai/L1/deep_dives/server_architecture.md create mode 100644 docs/ai/L1/deep_dives/testing.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000..d23c0aa719 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,26 @@ +# AI Agent Instructions + +This repository uses progressive disclosure documentation to help AI coding +agents work efficiently. Documentation is structured in three levels under +`docs/ai/`. + +## How to Load + +1. Read [docs/ai/L0_repo_card.md](docs/ai/L0_repo_card.md) to identify the repo. +2. Load ALL 8 files in `docs/ai/L1/`. They are small — load all of them upfront. + This gives you setup, architecture, code map, conventions, workflows, + interfaces, gotchas, and security. +3. If a task needs more detail than L1 provides, follow links to L2 deep dives + in `docs/ai/L1/deep_dives/`. Load only the specific L2 file you need. + +## Levels + +- **L0 (Repo Card):** Identity and L1 index. Table of contents. +- **L1 (Summaries):** Eight structured summaries. Load all at session start. +- **L2 (Deep Dives):** Full specifications. Load only when L1 isn't detailed enough. 
+ +## Working Areas + +- **AI Agents development**: `ai_agents/` — see `ai_agents/AGENTS.md` for workspace-specific context +- **Core framework**: `core/`, `packages/`, `build/` +- **Operational reference**: `ai/AI_working_with_ten.md` (full), `ai/AI_working_with_ten_compact.md` (quick) diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000..c2c4fb4158 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +Read @AGENTS.md for AI agent instructions and progressive disclosure docs. diff --git a/ai_agents/agents/examples/voice-assistant/tenapp/manifest.json b/ai_agents/agents/examples/voice-assistant/tenapp/manifest.json index 020768c826..97f3d5c3a2 100644 --- a/ai_agents/agents/examples/voice-assistant/tenapp/manifest.json +++ b/ai_agents/agents/examples/voice-assistant/tenapp/manifest.json @@ -152,6 +152,9 @@ }, { "path": "../../../ten_packages/extension/oracle_tts_python" + }, + { + "path": "../../../ten_packages/extension/deepgram_tts" } ], "scripts": { diff --git a/ai_agents/agents/examples/voice-assistant/tenapp/property.json b/ai_agents/agents/examples/voice-assistant/tenapp/property.json index 270bfb77be..dcd0b8e214 100644 --- a/ai_agents/agents/examples/voice-assistant/tenapp/property.json +++ b/ai_agents/agents/examples/voice-assistant/tenapp/property.json @@ -185,6 +185,190 @@ ] } }, + { + "name": "voice_assistant_deepgram_tts", + "auto_start": false, + "graph": { + "nodes": [ + { + "type": "extension", + "name": "agora_rtc", + "addon": "agora_rtc", + "extension_group": "default", + "property": { + "app_id": "${env:AGORA_APP_ID}", + "app_certificate": "${env:AGORA_APP_CERTIFICATE|}", + "channel": "ten_agent_test", + "stream_id": 1234, + "remote_stream_id": 123, + "subscribe_audio": true, + "publish_audio": true, + "publish_data": true, + "enable_agora_asr": false + } + }, + { + "type": "extension", + "name": "stt", + "addon": "deepgram_asr_python", + "extension_group": "stt", + "property": { + "params": { + "api_key": "${env:DEEPGRAM_API_KEY}", + 
"language": "en-US", + "model": "nova-3" + } + } + }, + { + "type": "extension", + "name": "llm", + "addon": "openai_llm2_python", + "extension_group": "chatgpt", + "property": { + "base_url": "https://api.openai.com/v1", + "api_key": "${env:OPENAI_API_KEY}", + "frequency_penalty": 0.9, + "model": "${env:OPENAI_MODEL}", + "max_tokens": 512, + "prompt": "", + "proxy_url": "${env:OPENAI_PROXY_URL|}", + "greeting": "TEN Agent connected. How can I help you today?", + "max_memory_length": 10 + } + }, + { + "type": "extension", + "name": "tts", + "addon": "deepgram_tts", + "extension_group": "tts", + "property": { + "dump": false, + "dump_path": "/tmp", + "params": { + "api_key": "${env:DEEPGRAM_API_KEY}", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000 + } + } + }, + { + "type": "extension", + "name": "main_control", + "addon": "main_python", + "extension_group": "control", + "property": { + "greeting": "TEN Agent connected. How can I help you today?" + } + }, + { + "type": "extension", + "name": "message_collector", + "addon": "message_collector2", + "extension_group": "transcriber", + "property": {} + }, + { + "type": "extension", + "name": "weatherapi_tool_python", + "addon": "weatherapi_tool_python", + "extension_group": "default", + "property": { + "api_key": "${env:WEATHERAPI_API_KEY|}" + } + }, + { + "type": "extension", + "name": "streamid_adapter", + "addon": "streamid_adapter", + "property": {} + } + ], + "connections": [ + { + "extension": "main_control", + "cmd": [ + { + "names": [ + "on_user_joined", + "on_user_left" + ], + "source": [ + { + "extension": "agora_rtc" + } + ] + }, + { + "names": [ + "tool_register" + ], + "source": [ + { + "extension": "weatherapi_tool_python" + } + ] + } + ], + "data": [ + { + "name": "asr_result", + "source": [ + { + "extension": "stt" + } + ] + } + ] + }, + { + "extension": "agora_rtc", + "audio_frame": [ + { + "name": "pcm_frame", + "dest": [ + { + "extension": "streamid_adapter" + } + ] + 
}, + { + "name": "pcm_frame", + "source": [ + { + "extension": "tts" + } + ] + } + ], + "data": [ + { + "name": "data", + "source": [ + { + "extension": "message_collector" + } + ] + } + ] + }, + { + "extension": "streamid_adapter", + "audio_frame": [ + { + "name": "pcm_frame", + "dest": [ + { + "extension": "stt" + } + ] + } + ] + } + ] + } + }, { "name": "voice_assistant_oracle", "auto_start": false, diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/README.md b/ai_agents/agents/ten_packages/extension/deepgram_tts/README.md new file mode 100644 index 0000000000..c8be961b39 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/README.md @@ -0,0 +1,97 @@ +# Deepgram TTS Extension + +A TEN Framework extension that provides Text-to-Speech (TTS) capabilities using Deepgram's Aura streaming API. + +## Features + +- Real-time streaming TTS via WebSocket +- Multiple voice models (Aura-2 series) +- Configurable sample rates (8000, 16000, 24000, 48000 Hz) +- Linear16 PCM audio output +- TTFB (Time to First Byte) metrics reporting +- Audio dump capability for debugging + +## Configuration + +### Properties + +| Property | Type | Default | Description | +|----------|------|---------|-------------| +| `params.api_key` | string | Required | Deepgram API key | +| `params.model` | string | `aura-2-thalia-en` | Voice model to use | +| `params.encoding` | string | `linear16` | Audio encoding format | +| `params.sample_rate` | int | `24000` | Output sample rate in Hz | +| `params.base_url` | string | `wss://api.deepgram.com/v1/speak` | WebSocket endpoint | +| `dump` | bool | `false` | Enable audio dumping | +| `dump_path` | string | `/tmp` | Path for audio dump files | + +### Example Configuration + +```json +{ + "params": { + "api_key": "${env:DEEPGRAM_API_KEY}", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000 + }, + "dump": false, + "dump_path": "/tmp" +} +``` + +## Available Voice Models + +Deepgram Aura-2 voices: 
+- `aura-2-thalia-en` - Female, English (default) +- `aura-2-luna-en` - Female, English +- `aura-2-stella-en` - Female, English +- `aura-2-athena-en` - Female, English +- `aura-2-hera-en` - Female, English +- `aura-2-orion-en` - Male, English +- `aura-2-arcas-en` - Male, English +- `aura-2-perseus-en` - Male, English +- `aura-2-angus-en` - Male, English +- `aura-2-orpheus-en` - Male, English +- `aura-2-helios-en` - Male, English +- `aura-2-zeus-en` - Male, English + +## Supported Sample Rates + +- 8000 Hz +- 16000 Hz +- 24000 Hz (recommended) +- 48000 Hz + +## API Interface + +This extension implements the standard TEN TTS interface: + +### Input Data +- `tts_text_input` - Text to synthesize +- `tts_flush` - Flush pending audio + +### Output Data +- `tts_audio_start` - Audio generation started +- `tts_audio_end` - Audio generation completed +- `metrics` - Performance metrics (TTFB, duration) +- `error` - Error information + +### Output Audio +- `pcm_frame` - PCM audio data (16-bit, mono) + +## Running Tests + +```bash +cd deepgram_tts +tman -y install --standalone +./tests/bin/start +``` + +## Environment Variables + +- `DEEPGRAM_API_KEY` - Your Deepgram API key + +## License + +Apache License, Version 2.0 diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/__init__.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/__init__.py new file mode 100644 index 0000000000..72593ab225 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/__init__.py @@ -0,0 +1,6 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# +from . 
import addon diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/addon.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/addon.py new file mode 100644 index 0000000000..477d15e16d --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/addon.py @@ -0,0 +1,20 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# +from ten_runtime import ( + Addon, + register_addon_as_extension, + TenEnv, +) + + +@register_addon_as_extension("deepgram_tts") +class DeepgramTTSExtensionAddon(Addon): + + def on_create_instance(self, ten_env: TenEnv, name: str, context) -> None: + from .extension import DeepgramTTSExtension + + ten_env.log_info("DeepgramTTSExtensionAddon on_create_instance") + ten_env.on_create_instance_done(DeepgramTTSExtension(name), context) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/config.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/config.py new file mode 100644 index 0000000000..901b2eb449 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/config.py @@ -0,0 +1,75 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. 
+# +from __future__ import annotations + +from typing import Any +import copy + +from ten_ai_base import utils + +from pydantic import BaseModel, Field + + +class DeepgramTTSConfig(BaseModel): + api_key: str = "" + base_url: str = "wss://api.deepgram.com/v1/speak" + + model: str = "aura-2-thalia-en" + encoding: str = "linear16" + sample_rate: int = 24000 + + dump: bool = False + dump_path: str = "/tmp" + params: dict[str, Any] = Field(default_factory=dict) + + def update_params(self) -> None: + params = self._ensure_dict(self.params) + self.params = params + + if "api_key" in params: + self.api_key = params["api_key"] + del params["api_key"] + + if "base_url" in params: + self.base_url = params["base_url"] + del params["base_url"] + + if "model" in params: + self.model = params["model"] + del params["model"] + + if "encoding" in params: + self.encoding = params["encoding"] + del params["encoding"] + + if "sample_rate" in params: + self.sample_rate = params["sample_rate"] + del params["sample_rate"] + + def to_str(self, sensitive_handling: bool = True) -> str: + """ + Convert the configuration to a string representation. 
+ """ + if not sensitive_handling: + return f"{self}" + + config = copy.deepcopy(self) + + # Encrypt sensitive fields + if config.api_key: + config.api_key = utils.encrypt(config.api_key) + if config.params and "api_key" in config.params: + config.params["api_key"] = utils.encrypt(config.params["api_key"]) + + return f"{config}" + + @staticmethod + def _ensure_dict(value: Any) -> dict[str, Any]: + if isinstance(value, dict): + return value + if value is None: + return {} + return dict(value) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py new file mode 100644 index 0000000000..1564afcc2d --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -0,0 +1,298 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# +import asyncio +import json +from collections.abc import Callable +from datetime import datetime +from typing import AsyncIterator + +import websockets +from websockets.asyncio.client import ClientConnection + +from .config import DeepgramTTSConfig +from ten_runtime import AsyncTenEnv +from ten_ai_base.const import LOG_CATEGORY_VENDOR + +# Custom event types to communicate status back to the extension +EVENT_TTS_RESPONSE = 1 +EVENT_TTS_END = 2 +EVENT_TTS_ERROR = 3 +EVENT_TTS_FLUSH = 4 +EVENT_TTS_TTFB_METRIC = 5 + + +class DeepgramTTSConnectionException(Exception): + """Exception raised when Deepgram TTS connection fails""" + + def __init__(self, status_code: int, body: str): + self.status_code = status_code + self.body = body + super().__init__( + f"Deepgram TTS connection failed (code: {status_code}): {body}" + ) + + +class DeepgramTTSClient: + def __init__( + self, + config: DeepgramTTSConfig, + ten_env: AsyncTenEnv, + send_fatal_tts_error: Callable[[str], asyncio.Future] | None = None, + 
send_non_fatal_tts_error: Callable[[str], asyncio.Future] | None = None, + ): + self.config = config + self.ten_env: AsyncTenEnv = ten_env + self._is_cancelled = False + self.ws: ClientConnection | None = None + self.send_fatal_tts_error = send_fatal_tts_error + self.send_non_fatal_tts_error = send_non_fatal_tts_error + + self.sent_ts: datetime | None = None + self.ttfb_sent: bool = False + + # Build WebSocket URL with query parameters + self.ws_url = self._build_ws_url() + + def _build_ws_url(self) -> str: + """Build the WebSocket URL with query parameters""" + base = self.config.base_url + params = f"model={self.config.model}&encoding={self.config.encoding}&sample_rate={self.config.sample_rate}" + return f"{base}?{params}" + + async def start(self) -> None: + """Preheating: establish websocket connection during initialization""" + try: + await self._connect() + + except Exception as e: + self.ten_env.log_error(f"Deepgram TTS preheat failed: {e}") + + async def _connect(self) -> None: + """Connect to the websocket""" + try: + extra_headers = { + "Authorization": f"Token {self.config.api_key}", + } + self.ws = await websockets.connect( + self.ws_url, + additional_headers=extra_headers, + ) + self.ten_env.log_debug( + "vendor_status: connected to deepgram tts", + category=LOG_CATEGORY_VENDOR, + ) + + except Exception as e: + error_message = str(e) + if "401" in error_message or "Unauthorized" in error_message: + if self.send_fatal_tts_error: + await self.send_fatal_tts_error(error_message=error_message) + else: + raise DeepgramTTSConnectionException( + status_code=401, body=error_message + ) from e + else: + self.ten_env.log_error( + f"Deepgram TTS preheat failed, unexpected error: {e}" + ) + if self.send_non_fatal_tts_error: + await self.send_non_fatal_tts_error( + error_message=error_message + ) + raise + + async def stop(self): + # Set cancellation flag first to stop any pending operations + self._is_cancelled = True + + # Stop the websocket connection if it 
exists + if self.ws: + try: + # Send close message + await self.ws.send(json.dumps({"type": "Close"})) + except Exception: + pass + await self.ws.close() + self.ws = None + + async def cancel(self): + """ + Cancel the current TTS task. + """ + self.ten_env.log_debug("Cancelling current TTS task.") + self._is_cancelled = True + if self.ws: + self.reset_ttfb() + # Send flush to clear any pending audio + try: + await self.ws.send(json.dumps({"type": "Flush"})) + except Exception: + pass + + async def reconnect(self): + """Close and re-establish the websocket connection.""" + if self.ws: + try: + await self.ws.close() + except Exception: + pass + self.ws = None + await self._connect() + + def reset_ttfb(self): + self.sent_ts = None + self.ttfb_sent = False + + async def get( + self, text: str + ) -> AsyncIterator[tuple[bytes | int | None, int | None]]: + """Generate TTS audio for the given text""" + + if len(text.strip()) == 0: + self.ten_env.log_warn( + "DeepgramTTS: empty text provided, " "returning END event" + ) + yield None, EVENT_TTS_END + return + + self._is_cancelled = False + try: + await self._ensure_connection() + async for audio_chunk, event_status in self._process_single_tts( + text + ): + if event_status == EVENT_TTS_FLUSH: + # Cancelled: reconnect for clean state + await self.reconnect() + break + + yield audio_chunk, event_status + + except Exception as e: + self.ten_env.log_error( + f"vendor_error: {e}", + category=LOG_CATEGORY_VENDOR, + ) + raise + + async def _ensure_connection(self) -> None: + """Ensure websocket connection is established""" + if not self.ws: + await self._connect() + + async def _process_single_tts( + self, text: str + ) -> AsyncIterator[tuple[bytes | int | None, int | None]]: + """Process a single TTS request""" + if not self.ws: + self.ten_env.log_error("Deepgram websocket not connected") + return + + self.ten_env.log_debug(f"process_single_tts, text: {text}") + + if not self.ttfb_sent: + self.sent_ts = datetime.now() + + # Send 
the text to Deepgram + speak_msg = { + "type": "Speak", + "text": text, + } + await self.ws.send(json.dumps(speak_msg)) + + # Send flush to get audio immediately + await self.ws.send(json.dumps({"type": "Flush"})) + + try: + # Receive audio data + while True: + if self._is_cancelled: + self.ten_env.log_debug( + "Cancellation flag detected, stopping TTS stream." + ) + yield None, EVENT_TTS_FLUSH + break + + try: + message = await asyncio.wait_for( + self.ws.recv(), timeout=5.0 + ) + except asyncio.TimeoutError: + self.ten_env.log_error( + "Timeout waiting for Deepgram audio - yielding error" + ) + yield b"Timeout waiting for Deepgram audio", EVENT_TTS_ERROR + break + + # Binary message = audio data + if isinstance(message, bytes): + # Drop audio if cancelled during recv + if self._is_cancelled: + self.ten_env.log_debug( + "Cancellation detected after recv, " + "dropping audio chunk." + ) + yield None, EVENT_TTS_FLUSH + break + + # First audio chunk, calculate TTFB + if self.sent_ts and not self.ttfb_sent: + ttfb_ms = int( + (datetime.now() - self.sent_ts).total_seconds() + * 1000 + ) + yield ttfb_ms, EVENT_TTS_TTFB_METRIC + self.ttfb_sent = True + + self.ten_env.log_debug( + f"DeepgramTTS: sending EVENT_TTS_RESPONSE, " + f"length: {len(message)}" + ) + yield message, EVENT_TTS_RESPONSE + + # Text message = JSON metadata + else: + try: + data = json.loads(message) + msg_type = data.get("type", "") + + if msg_type == "Flushed": + # All audio for this text has been sent + self.ten_env.log_debug( + "DeepgramTTS: received Flushed, " + "sending EVENT_TTS_END" + ) + yield None, EVENT_TTS_END + break + + elif msg_type == "Warning": + self.ten_env.log_warn( + f"Deepgram warning: {data.get('warn_msg', '')}" + ) + + elif msg_type == "Error": + error_msg = data.get("err_msg", "Unknown error") + self.ten_env.log_error( + f"Deepgram error: {error_msg}" + ) + yield error_msg.encode("utf-8"), EVENT_TTS_ERROR + break + + except json.JSONDecodeError: + self.ten_env.log_warn( + 
f"Failed to parse Deepgram message: {message}" + ) + + if not self._is_cancelled: + self.ten_env.log_debug("DeepgramTTS: TTS complete") + + except Exception as e: + error_message = str(e) + self.ten_env.log_error( + f"vendor_error: {error_message}", + category=LOG_CATEGORY_VENDOR, + ) + yield error_message.encode("utf-8"), EVENT_TTS_ERROR diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py new file mode 100644 index 0000000000..749f70f4eb --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -0,0 +1,489 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# +import asyncio +from datetime import datetime +import os +import traceback + +from ten_ai_base.helper import PCMWriter +from ten_ai_base.message import ( + ModuleError, + ModuleErrorCode, + ModuleType, + ModuleErrorVendorInfo, + TTSAudioEndReason, +) +from ten_ai_base.struct import TTSTextInput +from ten_ai_base.tts2 import AsyncTTS2BaseExtension +from ten_ai_base.const import LOG_CATEGORY_VENDOR, LOG_CATEGORY_KEY_POINT +from .config import DeepgramTTSConfig + +from .deepgram_tts import ( + EVENT_TTS_END, + EVENT_TTS_RESPONSE, + EVENT_TTS_TTFB_METRIC, + EVENT_TTS_ERROR, + DeepgramTTSClient, + DeepgramTTSConnectionException, +) +from ten_runtime import AsyncTenEnv + + +class DeepgramTTSExtension(AsyncTTS2BaseExtension): + def __init__(self, name: str) -> None: + super().__init__(name) + self.config: DeepgramTTSConfig | None = None + self.client: DeepgramTTSClient | None = None + self.current_request_id: str | None = None + self.current_turn_id: int = -1 + self.sent_ts: datetime | None = None + self.current_request_finished: bool = False + self.total_audio_bytes: int = 0 + self._is_stopped: bool = False + self.recorder_map: dict[str, PCMWriter] = {} + 
self._audio_start_sent: bool = False + + async def on_init(self, ten_env: AsyncTenEnv) -> None: + try: + await super().on_init(ten_env) + config_json_str, _ = await self.ten_env.get_property_to_json("") + + if not config_json_str or config_json_str.strip() == "{}": + raise ValueError( + "Configuration is empty. " + "Required parameter 'api_key' is missing." + ) + + self.config = DeepgramTTSConfig.model_validate_json(config_json_str) + self.config.update_params() + ten_env.log_info( + f"LOG_CATEGORY_KEY_POINT: " + f"{self.config.to_str(sensitive_handling=True)}", + category=LOG_CATEGORY_KEY_POINT, + ) + + if not self.config.api_key: + raise ValueError("API key is required") + + self.client = self._create_client(ten_env) + await self.client.start() + ten_env.log_debug("DeepgramTTS client initialized successfully") + except Exception as e: + ten_env.log_error(f"on_init failed: {traceback.format_exc()}") + await self.send_tts_error( + request_id="", + error=ModuleError( + message=f"Initialization failed: {e}", + module=ModuleType.TTS, + code=ModuleErrorCode.FATAL_ERROR, + vendor_info=ModuleErrorVendorInfo(vendor=self.vendor()), + ), + ) + + async def on_stop(self, ten_env: AsyncTenEnv) -> None: + self._is_stopped = True + ten_env.log_debug("Extension stopping, rejecting new requests") + + if self.client: + await self.client.stop() + self.client = None + + for request_id, recorder in list(self.recorder_map.items()): + try: + await recorder.flush() + ten_env.log_debug( + f"Flushed PCMWriter for request_id: " f"{request_id}" + ) + except Exception as e: + ten_env.log_error( + f"Error flushing PCMWriter for " + f"request_id {request_id}: {e}" + ) + + await super().on_stop(ten_env) + ten_env.log_debug("on_stop") + + async def on_deinit(self, ten_env: AsyncTenEnv) -> None: + await super().on_deinit(ten_env) + ten_env.log_debug("on_deinit") + + async def cancel_tts(self) -> None: + self.current_request_finished = True + if self.current_request_id: + self.ten_env.log_debug( + 
f"Current request {self.current_request_id} " + f"is being cancelled. Sending INTERRUPTED." + ) + if self.client: + await self.client.cancel() + if self.sent_ts: + await self._finalize_request(TTSAudioEndReason.INTERRUPTED) + else: + self.ten_env.log_warn( + "No current request found, " "skipping TTS cancellation." + ) + + def vendor(self) -> str: + return "deepgram" + + def synthesize_audio_sample_rate(self) -> int: + if self.config is None: + return 24000 + return self.config.sample_rate + + def _create_client(self, ten_env: AsyncTenEnv) -> DeepgramTTSClient: + return DeepgramTTSClient( + config=self.config, + ten_env=ten_env, + send_fatal_tts_error=self.send_fatal_tts_error, + send_non_fatal_tts_error=(self.send_non_fatal_tts_error), + ) + + async def _ensure_client(self) -> None: + """Ensure client is connected, reconnecting if needed.""" + if self.client is None: + self.ten_env.log_debug( + "TTS client is not initialized, reconnecting..." + ) + self.client = self._create_client(self.ten_env) + await self.client.start() + self.ten_env.log_debug("TTS client reconnected successfully.") + + async def _reconnect_client(self) -> None: + """Destroy current client and reconnect immediately.""" + if self.client: + await self.client.stop() + self.client = None + try: + self.client = self._create_client(self.ten_env) + await self.client.start() + self.ten_env.log_debug("Client reconnected after error.") + except Exception as e: + self.ten_env.log_error(f"Immediate reconnect failed: {e}") + self.client = None + + async def _finalize_request( + self, + reason: TTSAudioEndReason, + error: ModuleError | None = None, + ) -> None: + """Send audio end, flush recorder, finish request.""" + if not self._audio_start_sent: + await self.send_tts_audio_start( + request_id=self.current_request_id, + ) + self._audio_start_sent = True + + request_event_interval = self._current_request_interval_ms() + duration_ms = self._calculate_audio_duration_ms() + + await self.send_tts_audio_end( + 
request_id=self.current_request_id, + request_event_interval_ms=request_event_interval, + request_total_audio_duration_ms=duration_ms, + reason=reason, + ) + + if self.current_request_id in self.recorder_map: + await self.recorder_map[self.current_request_id].flush() + + await self.finish_request( + request_id=self.current_request_id, + reason=reason, + error=error, + ) + + self.sent_ts = None + self.ten_env.log_debug( + f"Finalized request, reason: {reason}, " + f"interval: {request_event_interval}ms, " + f"duration: {duration_ms}ms" + ) + + async def request_tts(self, t: TTSTextInput) -> None: + """Handle TTS requests.""" + try: + self.ten_env.log_info( + f"Requesting TTS for text: {t.text}, " + f"text_input_end: {t.text_input_end} " + f"request ID: {t.request_id}", + ) + + await self._ensure_client() + + if t.request_id != self.current_request_id: + self.ten_env.log_debug( + f"New TTS request with ID: {t.request_id}" + ) + if self.client: + self.client.reset_ttfb() + self.current_request_id = t.request_id + self.current_request_finished = False + self.total_audio_bytes = 0 + self.sent_ts = None + self._audio_start_sent = False + if t.metadata is not None: + self.session_id = t.metadata.get("session_id", "") + self.current_turn_id = t.metadata.get("turn_id", -1) + self._setup_recorder(t.request_id) + elif self.current_request_finished: + self.ten_env.log_error( + f"Received a message for a finished " + f"request_id '{t.request_id}' with " + f"text_input_end=False." 
+ ) + return + + if t.text_input_end: + self.ten_env.log_debug( + f"KEYPOINT finish session for " + f"request ID: {t.request_id}" + ) + self.current_request_finished = True + + prepared_text = t.text.strip() + + if self._is_stopped: + self.ten_env.log_debug( + f"TTS is stopped, skipping " f"request_id: {t.request_id}" + ) + return + + if prepared_text != "": + await self._process_tts_text(prepared_text, t) + elif t.text_input_end: + await self._finalize_request(TTSAudioEndReason.REQUEST_END) + + except DeepgramTTSConnectionException as e: + await self._handle_connection_error(e) + + except Exception as e: + self.ten_env.log_error( + f"Error in request_tts: " + f"{traceback.format_exc()}. text: {t.text}" + ) + error = ModuleError( + message=str(e), + module=ModuleType.TTS, + code=ModuleErrorCode.NON_FATAL_ERROR, + vendor_info=ModuleErrorVendorInfo(vendor=self.vendor()), + ) + await self.send_tts_error( + request_id=self.current_request_id, + error=error, + ) + await self.finish_request( + request_id=self.current_request_id, + reason=TTSAudioEndReason.ERROR, + error=error, + ) + if isinstance(e, ConnectionRefusedError): + await self._reconnect_client() + + async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: + """Process non-empty text through the TTS pipeline.""" + self.ten_env.log_debug( + f"send_text_to_tts_server: {text} " + f"of request_id: {t.request_id}", + category=LOG_CATEGORY_VENDOR, + ) + data = self.client.get(text) + + chunk_count = 0 + if self.sent_ts is None: + self.sent_ts = datetime.now() + + async for data_msg, event_status in data: + self.ten_env.log_debug(f"Received event_status: {event_status}") + if event_status == EVENT_TTS_RESPONSE: + if ( + data_msg is not None + and isinstance(data_msg, bytes) + and len(data_msg) > 0 + ): + chunk_count += 1 + self.total_audio_bytes += len(data_msg) + self.ten_env.log_info( + f"Received audio chunk " + f"#{chunk_count}, " + f"size: {len(data_msg)} bytes" + ) + self._write_dump(data_msg) + 
await self.send_tts_audio_data(data_msg) + else: + self.ten_env.log_debug( + "Received empty payload for " "TTS response" + ) + if t.text_input_end: + await self._finalize_request( + TTSAudioEndReason.REQUEST_END + ) + + elif event_status == EVENT_TTS_TTFB_METRIC: + if data_msg is not None and isinstance(data_msg, int): + self.sent_ts = datetime.now() + ttfb = data_msg + await self.send_tts_audio_start( + request_id=self.current_request_id, + ) + self._audio_start_sent = True + await self.send_tts_ttfb_metrics( + request_id=self.current_request_id, + ttfb_ms=ttfb, + extra_metadata={ + "model": self.config.model, + }, + ) + self.ten_env.log_debug( + f"Sent TTS audio start and " f"TTFB metrics: {ttfb}ms" + ) + + elif event_status == EVENT_TTS_END: + self.ten_env.log_info( + "Received TTS_END event from " "Deepgram TTS" + ) + if t.text_input_end: + await self._finalize_request(TTSAudioEndReason.REQUEST_END) + break + + elif event_status == EVENT_TTS_ERROR: + self.ten_env.log_error( + "Received TTS_ERROR event from " "Deepgram TTS" + ) + error_msg = ( + data_msg.decode("utf-8") + if isinstance(data_msg, bytes) + else str(data_msg) + ) + if t.text_input_end: + await self._finalize_request( + TTSAudioEndReason.ERROR, + error=ModuleError( + message=error_msg, + module=ModuleType.TTS, + code=(ModuleErrorCode.NON_FATAL_ERROR), + vendor_info=ModuleErrorVendorInfo( + vendor=self.vendor() + ), + ), + ) + break + + self.ten_env.log_debug( + f"TTS processing completed, " f"total chunks: {chunk_count}" + ) + + async def _handle_connection_error( + self, e: DeepgramTTSConnectionException + ) -> None: + """Handle Deepgram connection errors.""" + self.ten_env.log_error( + f"DeepgramTTSConnectionException in request_tts: " f"{e.body}" + ) + if e.status_code == 401: + code = ModuleErrorCode.FATAL_ERROR + else: + code = ModuleErrorCode.NON_FATAL_ERROR + + error = ModuleError( + message=e.body, + module=ModuleType.TTS, + code=code, + vendor_info=ModuleErrorVendorInfo( + 
vendor=self.vendor(), + code=str(e.status_code), + message=e.body, + ), + ) + await self.send_tts_error( + request_id=self.current_request_id, + error=error, + ) + await self.finish_request( + request_id=self.current_request_id, + reason=TTSAudioEndReason.ERROR, + error=error, + ) + + def _setup_recorder(self, request_id: str) -> None: + """Set up PCMWriter for a new request.""" + if not (self.config and self.config.dump): + return + # Clean up old PCMWriters + for old_rid in [ + rid for rid in self.recorder_map.keys() if rid != request_id + ]: + try: + asyncio.create_task(self.recorder_map[old_rid].flush()) + del self.recorder_map[old_rid] + self.ten_env.log_debug( + f"Cleaned up old PCMWriter for " f"request_id: {old_rid}" + ) + except Exception as e: + self.ten_env.log_error( + f"Error cleaning up PCMWriter for " + f"request_id {old_rid}: {e}" + ) + + if request_id not in self.recorder_map: + dump_file_path = os.path.join( + self.config.dump_path, + f"deepgram_dump_{request_id}.pcm", + ) + self.recorder_map[request_id] = PCMWriter(dump_file_path) + self.ten_env.log_debug( + f"Created PCMWriter for request_id: " + f"{request_id}, file: {dump_file_path}" + ) + + def _write_dump(self, data: bytes) -> None: + """Write audio data to dump file if enabled.""" + if ( + self.config + and self.config.dump + and self.current_request_id + and self.current_request_id in self.recorder_map + ): + asyncio.create_task( + self.recorder_map[self.current_request_id].write(data) + ) + + async def send_fatal_tts_error(self, error_message: str) -> None: + await self.send_tts_error( + request_id=self.current_request_id or "", + error=ModuleError( + message=error_message, + module=ModuleType.TTS, + code=ModuleErrorCode.FATAL_ERROR, + vendor_info=ModuleErrorVendorInfo(vendor=self.vendor()), + ), + ) + + async def send_non_fatal_tts_error(self, error_message: str) -> None: + await self.send_tts_error( + request_id=self.current_request_id or "", + error=ModuleError( + 
message=error_message, + module=ModuleType.TTS, + code=ModuleErrorCode.NON_FATAL_ERROR, + vendor_info=ModuleErrorVendorInfo(vendor=self.vendor()), + ), + ) + + def _current_request_interval_ms(self) -> int: + if not self.sent_ts: + return 0 + return int((datetime.now() - self.sent_ts).total_seconds() * 1000) + + def _calculate_audio_duration_ms(self) -> int: + if self.config is None: + return 0 + bytes_per_sample = 2 # 16-bit PCM + channels = 1 # Mono + duration_sec = self.total_audio_bytes / ( + self.synthesize_audio_sample_rate() * bytes_per_sample * channels + ) + return int(duration_sec * 1000) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/manifest.json b/ai_agents/agents/ten_packages/extension/deepgram_tts/manifest.json new file mode 100644 index 0000000000..c2ef9bb7a0 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/manifest.json @@ -0,0 +1,65 @@ +{ + "type": "extension", + "name": "deepgram_tts", + "version": "0.1.0", + "dependencies": [ + { + "type": "system", + "name": "ten_runtime_python", + "version": "0.11" + }, + { + "type": "system", + "name": "ten_ai_base", + "version": "0.7" + } + ], + "package": { + "include": [ + "manifest.json", + "property.json", + "BUILD.gn", + "**.tent", + "**.py", + "README.md", + "requirements.txt" + ] + }, + "api": { + "interface": [ + { + "import_uri": "../../system/ten_ai_base/api/tts-interface.json" + } + ], + "property": { + "properties": { + "dump": { + "type": "bool" + }, + "dump_path": { + "type": "string" + }, + "params": { + "type": "object", + "properties": { + "api_key": { + "type": "string" + }, + "base_url": { + "type": "string" + }, + "model": { + "type": "string" + }, + "encoding": { + "type": "string" + }, + "sample_rate": { + "type": "int32" + } + } + } + } + } + } +} diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/property.json b/ai_agents/agents/ten_packages/extension/deepgram_tts/property.json new file mode 100644 index 0000000000..313cff84f4 
--- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/property.json @@ -0,0 +1,11 @@ +{ + "dump": false, + "dump_path": "/tmp", + "params": { + "api_key": "${env:DEEPGRAM_API_KEY}", + "base_url": "wss://api.deepgram.com/v1/speak", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000 + } +} diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/requirements.txt b/ai_agents/agents/ten_packages/extension/deepgram_tts/requirements.txt new file mode 100644 index 0000000000..31b5e2f348 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/requirements.txt @@ -0,0 +1 @@ +websockets>=12.0 diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/__init__.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/__init__.py new file mode 100644 index 0000000000..da402faf43 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/__init__.py @@ -0,0 +1,5 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/bin/start b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/bin/start new file mode 100755 index 0000000000..41da3fdb45 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/bin/start @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e + +cd "$(dirname "${BASH_SOURCE[0]}")/../.." + +export PYTHONPATH=.ten/app:.ten/app/ten_packages/system/ten_runtime_python/lib:.ten/app/ten_packages/system/ten_runtime_python/interface:.ten/app/ten_packages/system/ten_ai_base/interface:$PYTHONPATH + +# If the Python app imports some modules that are compiled with a different +# version of libstdc++ (ex: PyTorch), the Python app may encounter confusing +# errors. To solve this problem, we can preload the correct version of +# libstdc++. 
+# +# export LD_PRELOAD=/lib/x86_64-linux-gnu/libstdc++.so.6 +# +# Another solution is to make sure the module 'ten_runtime_python' is imported +# _after_ the module that requires another version of libstdc++ is imported. +# +# Refer to https://github.com/pytorch/pytorch/issues/102360?from_wecom=1#issuecomment-1708989096 + +pytest tests/ "$@" diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_basic_audio_setting1.json b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_basic_audio_setting1.json new file mode 100644 index 0000000000..ff0a081e87 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_basic_audio_setting1.json @@ -0,0 +1,10 @@ +{ + "dump": true, + "dump_path": "./tests/keep_dump_output/", + "params": { + "api_key": "${env:DEEPGRAM_API_KEY}", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000 + } +} diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_basic_audio_setting2.json b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_basic_audio_setting2.json new file mode 100644 index 0000000000..c753384856 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_basic_audio_setting2.json @@ -0,0 +1,10 @@ +{ + "dump": true, + "dump_path": "./tests/keep_dump_output/", + "params": { + "api_key": "${env:DEEPGRAM_API_KEY}", + "model": "aura-2-luna-en", + "encoding": "linear16", + "sample_rate": 16000 + } +} diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_dump.json b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_dump.json new file mode 100644 index 0000000000..4690fecb76 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/configs/property_dump.json @@ -0,0 +1,10 @@ +{ + "dump": true, + "dump_path": "./dump/", + "params": { + "api_key": 
class FakeApp(App):
    """Minimal TEN app that signals readiness through a threading.Event."""

    def __init__(self):
        super().__init__()
        self.event: threading.Event | None = None

    # `on_init` (not `on_configure`) releases the waiting test fixture: in
    # the TEN runtime C core, the addon manager is bound to the (fake) app
    # only after `on_configure_done`, and `on_init()` is the earliest
    # user-layer hook that runs after that binding. So the fixture lock is
    # released here.
    @override
    def on_init(self, ten_env: TenEnv) -> None:
        assert self.event
        self.event.set()
        ten_env.on_init_done()

    @override
    def on_configure(self, ten_env: TenEnv) -> None:
        log_config = {
            "ten": {
                "log": {
                    "handlers": [
                        {
                            "matchers": [{"level": "debug"}],
                            "formatter": {
                                "type": "plain",
                                "colored": True,
                            },
                            "emitter": {
                                "type": "console",
                                "config": {"stream": "stdout"},
                            },
                        }
                    ]
                }
            }
        }
        ten_env.init_property_from_json(json.dumps(log_config))
        ten_env.on_configure_done()


class FakeAppCtx:
    """Carries the fake app instance and its readiness event across threads."""

    def __init__(self, event: threading.Event):
        self.fake_app: FakeApp | None = None
        self.event = event


def run_fake_app(fake_app_ctx: FakeAppCtx):
    """Thread target: construct the fake app and run it (blocking)."""
    app = FakeApp()
    app.event = fake_app_ctx.event
    fake_app_ctx.fake_app = app
    app.run(False)


@pytest.fixture(scope="session", autouse=True)
def global_setup_and_teardown():
    """Run a fake TEN app on a background thread for the whole test session."""
    event = threading.Event()
    ctx = FakeAppCtx(event)

    app_thread = threading.Thread(target=run_fake_app, args=(ctx,))
    app_thread.start()

    # Block until the app reaches on_init (see FakeApp.on_init).
    event.wait()
    assert ctx.fake_app is not None

    # Yield control to the tests; teardown runs afterwards.
    yield

    ctx.fake_app.close()
    app_thread.join()
import sys
from pathlib import Path

# Make the repo root importable when running pytest from this directory;
# the root sits six levels above this file's parent directory.
project_root = str(Path(__file__).resolve().parents[6])
if project_root not in sys.path:
    sys.path.insert(0, project_root)

#
# This file is part of TEN Framework, an open source project.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for more information.
#
from pathlib import Path
import json
from unittest.mock import patch, AsyncMock
import os
import asyncio
import filecmp
import shutil

from ten_runtime import (
    ExtensionTester,
    TenEnvTester,
    Data,
)
from ten_ai_base.struct import TTSTextInput, TTSFlush
from deepgram_tts.deepgram_tts import (
    EVENT_TTS_RESPONSE,
    EVENT_TTS_END,
    EVENT_TTS_FLUSH,
    EVENT_TTS_TTFB_METRIC,
)


# ================ test dump file functionality ================
class ExtensionTesterDump(ExtensionTester):
    """Collects audio frames and compares them to the extension's dump file."""

    def __init__(self):
        super().__init__()
        self.dump_dir = "./dump/"
        self.test_dump_file_path = os.path.join(
            self.dump_dir, "test_manual_dump.pcm"
        )
        self.audio_end_received = False
        self.received_audio_chunks = []

    def on_start(self, ten_env_tester: TenEnvTester) -> None:
        ten_env_tester.log_info("Dump test started, sending TTS request.")
        payload = TTSTextInput(
            request_id="tts_request_1",
            text="hello word, hello agora",
            text_input_end=True,
        )
        msg = Data.create("tts_text_input")
        msg.set_property_from_json(None, payload.model_dump_json())
        ten_env_tester.send_data(msg)
        ten_env_tester.on_start_done()

    def on_data(self, ten_env: TenEnvTester, data) -> None:
        if data.get_name() == "tts_audio_end":
            ten_env.log_info("Received tts_audio_end, stopping test.")
            self.audio_end_received = True
            ten_env.stop_test()

    def on_audio_frame(self, ten_env: TenEnvTester, audio_frame):
        buf = audio_frame.lock_buf()
        try:
            self.received_audio_chunks.append(bytes(buf))
        finally:
            audio_frame.unlock_buf(buf)

    def write_test_dump_file(self):
        # Concatenate every received chunk into a reference PCM file.
        with open(self.test_dump_file_path, "wb") as f:
            f.write(b"".join(self.received_audio_chunks))

    def find_tts_dump_file(self) -> str | None:
        # First .pcm in the dump dir that is not our own reference file.
        if not os.path.exists(self.dump_dir):
            return None
        own_name = os.path.basename(self.test_dump_file_path)
        for filename in os.listdir(self.dump_dir):
            if filename.endswith(".pcm") and filename != own_name:
                return os.path.join(self.dump_dir, filename)
        return None


@patch("deepgram_tts.extension.DeepgramTTSClient")
def test_dump_functionality(MockDeepgramTTSClient):
    """Tests that the dump file from the TTS extension matches the audio received."""
    print("Starting test_dump_functionality with mock...")

    DUMP_PATH = "./dump/"
    if os.path.exists(DUMP_PATH):
        shutil.rmtree(DUMP_PATH)
    os.makedirs(DUMP_PATH)

    mock_instance = MockDeepgramTTSClient.return_value
    mock_instance.start = AsyncMock()
    mock_instance.stop = AsyncMock()
    mock_instance.cancel = AsyncMock()
    mock_instance.reset_ttfb = lambda: None

    fake_audio_chunk_1 = b"\x11\x22\x33\x44" * 20
    fake_audio_chunk_2 = b"\xaa\xbb\xcc\xdd" * 20

    async def mock_get_audio_stream(text: str):
        # TTFB metric, two audio chunks with small gaps, then end-of-stream.
        yield (255, EVENT_TTS_TTFB_METRIC)
        yield (fake_audio_chunk_1, EVENT_TTS_RESPONSE)
        await asyncio.sleep(0.01)
        yield (fake_audio_chunk_2, EVENT_TTS_RESPONSE)
        await asyncio.sleep(0.01)
        yield (None, EVENT_TTS_END)

    mock_instance.get.side_effect = mock_get_audio_stream

    tester = ExtensionTesterDump()
    dump_config = {
        "dump": True,
        "dump_path": DUMP_PATH,
        "params": {
            "api_key": "test_api_key",
            "model": "aura-2-thalia-en",
            "encoding": "linear16",
            "sample_rate": 24000,
        },
    }
    tester.set_test_mode_single("deepgram_tts", json.dumps(dump_config))

    print("Running dump test...")
    tester.run()
    print("Dump test completed.")

    assert tester.audio_end_received, "Expected to receive tts_audio_end"
    assert (
        len(tester.received_audio_chunks) > 0
    ), "Expected to receive audio chunks"

    tester.write_test_dump_file()
    tts_dump_file = tester.find_tts_dump_file()
    assert (
        tts_dump_file is not None
    ), f"Expected to find a TTS dump file in {DUMP_PATH}"
    assert os.path.exists(
        tts_dump_file
    ), f"TTS dump file should exist: {tts_dump_file}"

    print(
        f"Comparing test file {tester.test_dump_file_path} with TTS dump file {tts_dump_file}"
    )
    assert filecmp.cmp(
        tester.test_dump_file_path, tts_dump_file, shallow=False
    ), "Test dump file and TTS dump file should have the same content"

    print(
        f"Dump test passed: received {len(tester.received_audio_chunks)} audio chunks"
    )

    if os.path.exists(DUMP_PATH):
        shutil.rmtree(DUMP_PATH)


# ================ test basic audio output ================
class ExtensionTesterBasic(ExtensionTester):
    """Verifies start/end events and audio frames for a single request."""

    def __init__(self):
        super().__init__()
        self.audio_start_received = False
        self.audio_end_received = False
        self.audio_chunks_count = 0

    def on_start(self, ten_env_tester: TenEnvTester) -> None:
        ten_env_tester.log_info("Basic test started, sending TTS request.")
        payload = TTSTextInput(
            request_id="tts_request_basic",
            text="Hello, this is a test of the Deepgram TTS extension.",
            text_input_end=True,
        )
        msg = Data.create("tts_text_input")
        msg.set_property_from_json(None, payload.model_dump_json())
        ten_env_tester.send_data(msg)
        ten_env_tester.on_start_done()

    def on_data(self, ten_env: TenEnvTester, data) -> None:
        name = data.get_name()
        if name == "tts_audio_start":
            ten_env.log_info("Received tts_audio_start.")
            self.audio_start_received = True
        elif name == "tts_audio_end":
            ten_env.log_info("Received tts_audio_end, stopping test.")
            self.audio_end_received = True
            ten_env.stop_test()

    def on_audio_frame(self, ten_env: TenEnvTester, audio_frame):
        self.audio_chunks_count += 1
@patch("deepgram_tts.extension.DeepgramTTSClient")
def test_basic_audio(MockDeepgramTTSClient):
    """Test basic TTS audio generation."""
    mock_instance = MockDeepgramTTSClient.return_value
    mock_instance.start = AsyncMock()
    mock_instance.stop = AsyncMock()
    mock_instance.cancel = AsyncMock()
    mock_instance.reset_ttfb = lambda: None

    fake_audio_chunk = b"\x00\x01\x02\x03" * 100

    async def mock_get_audio_stream(text: str):
        # TTFB metric, one audio chunk, then end-of-stream.
        yield (150, EVENT_TTS_TTFB_METRIC)
        yield (fake_audio_chunk, EVENT_TTS_RESPONSE)
        yield (None, EVENT_TTS_END)

    mock_instance.get.side_effect = mock_get_audio_stream

    config = {
        "params": {
            "api_key": "test_api_key",
            "model": "aura-2-thalia-en",
            "encoding": "linear16",
            "sample_rate": 24000,
        },
    }
    tester = ExtensionTesterBasic()
    tester.set_test_mode_single("deepgram_tts", json.dumps(config))
    tester.run()

    assert tester.audio_start_received, "tts_audio_start was not received."
    assert tester.audio_end_received, "tts_audio_end was not received."
    assert tester.audio_chunks_count > 0, "No audio chunks received."


# ================ test flush functionality ================
class ExtensionTesterFlush(ExtensionTester):
    """Sends text, a flush, then final text; expects an audio end event."""

    def __init__(self):
        super().__init__()
        self.audio_end_received = False

    def on_start(self, ten_env_tester: TenEnvTester) -> None:
        ten_env_tester.log_info("Flush test started.")

        first = TTSTextInput(
            request_id="tts_request_flush",
            text="This is the first sentence.",
            text_input_end=False,
        )
        first_msg = Data.create("tts_text_input")
        first_msg.set_property_from_json(None, first.model_dump_json())
        ten_env_tester.send_data(first_msg)

        flush = TTSFlush(flush_id="flush_1")
        flush_msg = Data.create("tts_flush")
        flush_msg.set_property_from_json(None, flush.model_dump_json())
        ten_env_tester.send_data(flush_msg)

        final = TTSTextInput(
            request_id="tts_request_flush",
            text="This is the final sentence.",
            text_input_end=True,
        )
        final_msg = Data.create("tts_text_input")
        final_msg.set_property_from_json(None, final.model_dump_json())
        ten_env_tester.send_data(final_msg)

        ten_env_tester.on_start_done()

    def on_data(self, ten_env: TenEnvTester, data) -> None:
        if data.get_name() == "tts_audio_end":
            ten_env.log_info("Received tts_audio_end, stopping test.")
            self.audio_end_received = True
            ten_env.stop_test()


@patch("deepgram_tts.extension.DeepgramTTSClient")
def test_flush(MockDeepgramTTSClient):
    """Test TTS flush functionality."""
    mock_instance = MockDeepgramTTSClient.return_value
    mock_instance.start = AsyncMock()
    mock_instance.stop = AsyncMock()
    mock_instance.cancel = AsyncMock()
    mock_instance.reset_ttfb = lambda: None

    fake_audio_chunk = b"\x00\x01\x02\x03" * 50

    async def mock_get_audio_stream(text: str):
        yield (100, EVENT_TTS_TTFB_METRIC)
        yield (fake_audio_chunk, EVENT_TTS_RESPONSE)
        yield (None, EVENT_TTS_END)

    mock_instance.get.side_effect = mock_get_audio_stream

    tester = ExtensionTesterFlush()
    tester.set_test_mode_single(
        "deepgram_tts",
        json.dumps(
            {
                "params": {
                    "api_key": "test_api_key",
                    "model": "aura-2-thalia-en",
                    "encoding": "linear16",
                    "sample_rate": 24000,
                },
            }
        ),
    )
    tester.run()

    assert (
        tester.audio_end_received
    ), "tts_audio_end was not received after flush."
def test_empty_params_fatal_error():
    """Test that empty params raises FATAL ERROR with code -1000"""
    print("Starting test_empty_params_fatal_error...")

    # Empty params configuration
    empty_params_config = {"params": {"api_key": ""}}

    tester = ExtensionTesterEmptyParams()
    tester.set_test_mode_single("deepgram_tts", json.dumps(empty_params_config))

    print("Running test...")
    tester.run()
    print("Test completed.")

    # A fatal error must be reported before any audio is produced.
    assert tester.error_received, "Expected to receive error message"
    assert (
        tester.error_code == -1000
    ), f"Expected error code -1000 (FATAL_ERROR), got {tester.error_code}"
    assert tester.error_message is not None, "Error message should not be None"
    assert len(tester.error_message) > 0, "Error message should not be empty"

    print(f"Empty params test passed: code={tester.error_code}")


# ================ test invalid api key ================
class ExtensionTesterInvalidApiKey(ExtensionTester):
    """Sends one TTS request and records any error event the extension emits."""

    def __init__(self):
        super().__init__()
        self.error_received = False
        self.error_code = None
        self.error_message = None
        self.vendor_info = None

    def on_start(self, ten_env_tester: TenEnvTester) -> None:
        """Called when test starts, sends a TTS request to trigger the logic."""
        ten_env_tester.log_info(
            "Invalid API key test started, sending TTS request"
        )
        request = TTSTextInput(
            request_id="test-request-invalid-key",
            text="This text will trigger API key validation.",
            text_input_end=True,
        )
        msg = Data.create("tts_text_input")
        msg.set_property_from_json(None, request.model_dump_json())
        ten_env_tester.send_data(msg)
        ten_env_tester.on_start_done()

    def on_data(self, ten_env: TenEnvTester, data) -> None:
        name = data.get_name()
        ten_env.log_info(f"on_data name: {name}")

        if name == "error":
            self.error_received = True
            json_str, _ = data.get_property_to_json(None)
            error_data = json.loads(json_str)
            self.error_code = error_data.get("code")
            self.error_message = error_data.get("message", "")
            self.vendor_info = error_data.get("vendor_info", {})
            ten_env.log_info(
                f"Received error: code={self.error_code}, message={self.error_message}"
            )
            ten_env.stop_test()
        elif name == "tts_audio_end":
            # Audio end without an error also terminates the run.
            ten_env.stop_test()


@patch("deepgram_tts.deepgram_tts.websockets.connect")
def test_invalid_api_key_error(mock_websocket_connect):
    """Test that an invalid API key is handled correctly with a mock."""
    print("Starting test_invalid_api_key_error with mock...")

    # Simulate the server rejecting the key at connect time.
    mock_websocket_connect.side_effect = Exception(
        "401 Unauthorized - Invalid API key"
    )

    # Config with invalid API key
    invalid_key_config = {
        "params": {
            "api_key": "invalid_api_key_test",
            "model": "aura-2-thalia-en",
            "encoding": "linear16",
            "sample_rate": 24000,
        },
    }

    tester = ExtensionTesterInvalidApiKey()
    tester.set_test_mode_single("deepgram_tts", json.dumps(invalid_key_config))

    print("Running test with mock...")
    tester.run()
    print("Test with mock completed.")

    # Verify FATAL ERROR was received for incorrect API key
    assert tester.error_received, "Expected to receive error message"
    assert (
        tester.error_code == -1000
    ), f"Expected error code -1000 (FATAL_ERROR), got {tester.error_code}"

    # Verify vendor_info
    vendor_info = tester.vendor_info
    assert vendor_info is not None, "Expected vendor_info to be present"
    assert (
        vendor_info.get("vendor") == "deepgram"
    ), f"Expected vendor 'deepgram', got {vendor_info.get('vendor')}"

    print(f"Invalid API key test passed: code={tester.error_code}")
# ================ test metrics ================
class ExtensionTesterMetrics(ExtensionTester):
    """Checks that a TTFB metric arrives and precedes the audio end event."""

    def __init__(self):
        super().__init__()
        self.ttfb_received = False
        self.ttfb_value = -1
        self.audio_frame_received = False
        self.audio_end_received = False

    def on_start(self, ten_env_tester: TenEnvTester) -> None:
        """Called when test starts, sends a TTS request."""
        ten_env_tester.log_info("Metrics test started, sending TTS request.")
        request = TTSTextInput(
            request_id="tts_request_for_metrics",
            text="hello, this is a metrics test.",
            text_input_end=True,
        )
        msg = Data.create("tts_text_input")
        msg.set_property_from_json(None, request.model_dump_json())
        ten_env_tester.send_data(msg)
        ten_env_tester.on_start_done()

    def on_data(self, ten_env: TenEnvTester, data) -> None:
        name = data.get_name()
        ten_env.log_info(f"on_data name: {name}")
        if name == "metrics":
            json_str, _ = data.get_property_to_json(None)
            ten_env.log_info(f"Received metrics: {json_str}")
            metrics_data = json.loads(json_str)
            # 'ttfb' is nested inside a 'metrics' object.
            nested_metrics = metrics_data.get("metrics", {})
            if "ttfb" in nested_metrics:
                self.ttfb_received = True
                self.ttfb_value = nested_metrics.get("ttfb", -1)
                ten_env.log_info(
                    f"Received TTFB metric with value: {self.ttfb_value}"
                )
        elif name == "tts_audio_end":
            self.audio_end_received = True
            # Stop the test only after both TTFB and audio end are received
            if self.ttfb_received:
                ten_env.log_info("Received tts_audio_end, stopping test.")
                ten_env.stop_test()

    def on_audio_frame(self, ten_env: TenEnvTester, audio_frame):
        """Receives audio frames and confirms the stream is working."""
        if not self.audio_frame_received:
            self.audio_frame_received = True
            ten_env.log_info("First audio frame received.")
# --- Assertions --- + assert tester.audio_frame_received, "Did not receive any audio frame." + assert tester.audio_end_received, "Did not receive the tts_audio_end event." + assert tester.ttfb_received, "TTFB metric was not received." + + # Check if the TTFB value matches what we sent + assert ( + tester.ttfb_value == 255 + ), f"Expected TTFB to be 255ms, but got {tester.ttfb_value}ms." + + print(f"TTFB metric test passed. Received TTFB: {tester.ttfb_value}ms.") diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py new file mode 100644 index 0000000000..d597cd6a52 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py @@ -0,0 +1,157 @@ +import sys +from pathlib import Path + +# Add project root to sys.path +project_root = str(Path(__file__).resolve().parents[6]) +if project_root not in sys.path: + sys.path.insert(0, project_root) + +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. 
+# +import json +from unittest.mock import patch, AsyncMock + +from ten_runtime import ( + ExtensionTester, + TenEnvTester, + Data, +) +from ten_ai_base.struct import TTSTextInput +from deepgram_tts.deepgram_tts import ( + EVENT_TTS_RESPONSE, + EVENT_TTS_END, + EVENT_TTS_TTFB_METRIC, +) + + +# ================ test different sample rates ================ +class ExtensionTesterSampleRate(ExtensionTester): + def __init__(self, sample_rate: int): + super().__init__() + self.sample_rate = sample_rate + self.audio_end_received = False + self.audio_chunks_count = 0 + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + ten_env_tester.log_info(f"Sample rate test: {self.sample_rate}Hz") + + tts_input = TTSTextInput( + request_id="tts_request_sr", + text="Testing different sample rates.", + text_input_end=True, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, tts_input.model_dump_json()) + ten_env_tester.send_data(data) + ten_env_tester.on_start_done() + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "tts_audio_end": + self.audio_end_received = True + ten_env.stop_test() + + def on_audio_frame(self, ten_env: TenEnvTester, audio_frame): + self.audio_chunks_count += 1 + + +def _create_mock_client(): + """Helper to create a mock client for tests.""" + from unittest.mock import MagicMock + + mock = MagicMock() + mock.start = AsyncMock() + mock.stop = AsyncMock() + mock.cancel = AsyncMock() + mock.reset_ttfb = lambda: None + + fake_audio_chunk = b"\x00\x01\x02\x03" * 100 + + async def mock_get_audio_stream(text: str): + yield (100, EVENT_TTS_TTFB_METRIC) + yield (fake_audio_chunk, EVENT_TTS_RESPONSE) + yield (None, EVENT_TTS_END) + + mock.get.side_effect = mock_get_audio_stream + return mock + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_sample_rate_16000(MockDeepgramTTSClient): + """Test with 16000 Hz sample rate.""" + MockDeepgramTTSClient.return_value = 
_create_mock_client() + + tester = ExtensionTesterSampleRate(16000) + tester.set_test_mode_single( + "deepgram_tts", + json.dumps( + { + "params": { + "api_key": "test_api_key", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 16000, + }, + } + ), + ) + + tester.run() + + assert tester.audio_end_received, "tts_audio_end was not received." + assert tester.audio_chunks_count > 0, "No audio chunks received." + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_sample_rate_24000(MockDeepgramTTSClient): + """Test with 24000 Hz sample rate.""" + MockDeepgramTTSClient.return_value = _create_mock_client() + + tester = ExtensionTesterSampleRate(24000) + tester.set_test_mode_single( + "deepgram_tts", + json.dumps( + { + "params": { + "api_key": "test_api_key", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000, + }, + } + ), + ) + + tester.run() + + assert tester.audio_end_received, "tts_audio_end was not received." + assert tester.audio_chunks_count > 0, "No audio chunks received." + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_sample_rate_48000(MockDeepgramTTSClient): + """Test with 48000 Hz sample rate.""" + MockDeepgramTTSClient.return_value = _create_mock_client() + + tester = ExtensionTesterSampleRate(48000) + tester.set_test_mode_single( + "deepgram_tts", + json.dumps( + { + "params": { + "api_key": "test_api_key", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 48000, + }, + } + ), + ) + + tester.run() + + assert tester.audio_end_received, "tts_audio_end was not received." + assert tester.audio_chunks_count > 0, "No audio chunks received." 
diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_robustness.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_robustness.py new file mode 100644 index 0000000000..b807fe5834 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_robustness.py @@ -0,0 +1,277 @@ +import sys +from pathlib import Path + +# Add project root to sys.path +project_root = str(Path(__file__).resolve().parents[6]) +if project_root not in sys.path: + sys.path.insert(0, project_root) + +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# +import json +from unittest.mock import patch, AsyncMock + +from ten_runtime import ( + ExtensionTester, + TenEnvTester, + Data, +) +from ten_ai_base.struct import TTSTextInput +from deepgram_tts.deepgram_tts import ( + EVENT_TTS_RESPONSE, + EVENT_TTS_END, + EVENT_TTS_TTFB_METRIC, +) + + +def _create_mock_client(): + """Helper to create a mock client for tests.""" + from unittest.mock import MagicMock + + mock = MagicMock() + mock.start = AsyncMock() + mock.stop = AsyncMock() + mock.cancel = AsyncMock() + mock.reset_ttfb = lambda: None + + fake_audio_chunk = b"\x00\x01\x02\x03" * 100 + + async def mock_get_audio_stream(text: str): + yield (100, EVENT_TTS_TTFB_METRIC) + yield (fake_audio_chunk, EVENT_TTS_RESPONSE) + yield (None, EVENT_TTS_END) + + mock.get.side_effect = mock_get_audio_stream + return mock + + +# ================ test empty text ================ +class ExtensionTesterEmptyText(ExtensionTester): + def __init__(self): + super().__init__() + self.audio_end_received = False + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + ten_env_tester.log_info("Empty text test started.") + + tts_input = TTSTextInput( + request_id="tts_request_empty", + text="", + text_input_end=True, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, 
tts_input.model_dump_json()) + ten_env_tester.send_data(data) + ten_env_tester.on_start_done() + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "tts_audio_end": + ten_env.log_info("Received tts_audio_end for empty text.") + self.audio_end_received = True + ten_env.stop_test() + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_empty_text(MockDeepgramTTSClient): + """Test that empty text is handled gracefully.""" + MockDeepgramTTSClient.return_value = _create_mock_client() + + tester = ExtensionTesterEmptyText() + tester.set_test_mode_single( + "deepgram_tts", + json.dumps( + { + "params": { + "api_key": "test_api_key", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000, + }, + } + ), + ) + + tester.run() + + assert ( + tester.audio_end_received + ), "tts_audio_end should be sent for empty text." + + +# ================ test whitespace only text ================ +class ExtensionTesterWhitespaceText(ExtensionTester): + def __init__(self): + super().__init__() + self.audio_end_received = False + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + ten_env_tester.log_info("Whitespace text test started.") + + tts_input = TTSTextInput( + request_id="tts_request_whitespace", + text=" \n\t ", + text_input_end=True, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, tts_input.model_dump_json()) + ten_env_tester.send_data(data) + ten_env_tester.on_start_done() + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "tts_audio_end": + ten_env.log_info("Received tts_audio_end for whitespace text.") + self.audio_end_received = True + ten_env.stop_test() + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_whitespace_text(MockDeepgramTTSClient): + """Test that whitespace-only text is handled gracefully.""" + MockDeepgramTTSClient.return_value = _create_mock_client() + + tester = 
ExtensionTesterWhitespaceText() + tester.set_test_mode_single( + "deepgram_tts", + json.dumps( + { + "params": { + "api_key": "test_api_key", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000, + }, + } + ), + ) + + tester.run() + + assert ( + tester.audio_end_received + ), "tts_audio_end should be sent for whitespace text." + + +# ================ test long text ================ +class ExtensionTesterLongText(ExtensionTester): + def __init__(self): + super().__init__() + self.audio_end_received = False + self.audio_chunks_count = 0 + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + ten_env_tester.log_info("Long text test started.") + + long_text = "This is a longer piece of text. " * 20 + + tts_input = TTSTextInput( + request_id="tts_request_long", + text=long_text, + text_input_end=True, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, tts_input.model_dump_json()) + ten_env_tester.send_data(data) + ten_env_tester.on_start_done() + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "tts_audio_end": + ten_env.log_info("Received tts_audio_end for long text.") + self.audio_end_received = True + ten_env.stop_test() + + def on_audio_frame(self, ten_env: TenEnvTester, audio_frame): + self.audio_chunks_count += 1 + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_long_text(MockDeepgramTTSClient): + """Test that long text is handled correctly.""" + MockDeepgramTTSClient.return_value = _create_mock_client() + + tester = ExtensionTesterLongText() + tester.set_test_mode_single( + "deepgram_tts", + json.dumps( + { + "params": { + "api_key": "test_api_key", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000, + }, + } + ), + ) + + tester.run() + + assert ( + tester.audio_end_received + ), "tts_audio_end was not received for long text." 
+ assert ( + tester.audio_chunks_count > 0 + ), "No audio chunks received for long text." + + +# ================ test special characters ================ +class ExtensionTesterSpecialChars(ExtensionTester): + def __init__(self): + super().__init__() + self.audio_end_received = False + self.error_received = False + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + ten_env_tester.log_info("Special characters test started.") + + tts_input = TTSTextInput( + request_id="tts_request_special", + text="Hello! How are you? I'm fine, thanks. $100 is 100%.", + text_input_end=True, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, tts_input.model_dump_json()) + ten_env_tester.send_data(data) + ten_env_tester.on_start_done() + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "tts_audio_end": + self.audio_end_received = True + ten_env.stop_test() + elif name == "error": + self.error_received = True + ten_env.stop_test() + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_special_characters(MockDeepgramTTSClient): + """Test that special characters are handled correctly.""" + MockDeepgramTTSClient.return_value = _create_mock_client() + + tester = ExtensionTesterSpecialChars() + tester.set_test_mode_single( + "deepgram_tts", + json.dumps( + { + "params": { + "api_key": "test_api_key", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000, + }, + } + ), + ) + + tester.run() + + assert tester.audio_end_received, "tts_audio_end was not received." + assert ( + not tester.error_received + ), "Error should not be received for special chars." diff --git a/docs/ai/L0_repo_card.md b/docs/ai/L0_repo_card.md new file mode 100644 index 0000000000..53c8a56953 --- /dev/null +++ b/docs/ai/L0_repo_card.md @@ -0,0 +1,31 @@ +# TEN Framework — Repo Card + +> Open-source platform for building real-time multimodal AI agents with voice, video, and tool capabilities. 
+ +## Identity + +| Field | Value | +| ------------- | -------------------------------------------------------------------- | +| Repo | `TEN-framework/TEN-Agent` | +| Type | `framework` (SDK-library + API-service + frontend) | +| Language | Python (extensions), Go (API server), TypeScript/React (playground) | +| Deploy Target | Docker container (`ten_agent_dev`), Taskfile-based build | +| Owner | TEN Framework team | +| Last Reviewed | 2026-04-02 | + +## L1 — Summaries + +| File | Purpose | +| ---------------------------------------- | -------------------------------------------------------- | +| [01_setup](L1/01_setup.md) | Docker, .env, ports, health checks, restart procedures | +| [02_architecture](L1/02_architecture.md) | Extensions, graphs, connections, RTC-first design | +| [03_code_map](L1/03_code_map.md) | Directory tree, key files, base classes, 93+ extensions | +| [04_conventions](L1/04_conventions.md) | Naming, Pydantic configs, params pattern, formatting | +| [05_workflows](L1/05_workflows.md) | Create extension, modify graph, test, restart, deploy | +| [06_interfaces](L1/06_interfaces.md) | REST API, connection schemas, base class abstract methods| +| [07_gotchas](L1/07_gotchas.md) | Property tuples, signal handlers, zombies, .env timing | +| [08_security](L1/08_security.md) | API keys, .env, sensitive logging, git hooks | + +## L2 — Deep Dives + +See [L1/deep_dives/_index.md](L1/deep_dives/_index.md) for extended guides referenced by L1 files. diff --git a/docs/ai/L1/01_setup.md b/docs/ai/L1/01_setup.md new file mode 100644 index 0000000000..c6003da202 --- /dev/null +++ b/docs/ai/L1/01_setup.md @@ -0,0 +1,118 @@ +# 01 Setup + +> Environment setup, local development, and quick commands for TEN Framework AI Agents. 
+ +## Prerequisites + +| Requirement | Version / Notes | +| ----------------- | ------------------------------------------------------------ | +| Docker + Compose | Required for container-based development | +| Node.js | LTS v18+ on host; container has Node 22 | +| API Keys | Agora App ID, OpenAI, Deepgram ASR, ElevenLabs TTS (minimum)| +| Hardware | 2+ CPU cores, 4 GB RAM minimum | + +## Docker Container + +```bash +cd /home/ubuntu/ten-framework/ai_agents +docker compose up -d +docker ps | grep ten_agent_dev # Verify running +``` + +Container image: `ghcr.io/ten-framework/ten_agent_build:0.7.14` + +## Environment Variables + +**Single .env file**: `ai_agents/.env` — the ONLY source of environment config. + +| Variable | Purpose | Required | +| ---------------------------- | ---------------------------- | -------- | +| `AGORA_APP_ID` | Agora RTC app identifier | Yes | +| `AGORA_APP_CERTIFICATE` | Agora RTC certificate | No | +| `OPENAI_API_KEY` | LLM provider | Yes | +| `OPENAI_MODEL` | Model name (e.g., `gpt-4o`) | Yes | +| `DEEPGRAM_API_KEY` | ASR provider | Yes | +| `ELEVENLABS_TTS_KEY` | TTS provider | Yes | +| `LOG_STDOUT` | Worker log visibility | Yes (`true`) | +| `SERVER_PORT` | API server port | Yes (`8080`) | +| `WORKERS_MAX` | Max concurrent sessions | Yes (`100`) | +| `WORKER_QUIT_TIMEOUT_SECONDS`| Worker idle timeout | Yes (`60`) | + +See `.env.example` for the complete list. Extensions may require additional keys +(Azure, AWS, Rime, etc.) — check extension README files. + +## Install and Run + +```bash +# 1. Install Python dependencies (NOT persisted across container restarts) +docker exec ten_agent_dev bash -c \ + "cd /app/agents/examples/voice-assistant-advanced/tenapp && \ + bash scripts/install_python_deps.sh" + +# 2. Build and install (5-8 minutes first time) +docker exec ten_agent_dev bash -c \ + "cd /app/agents/examples/voice-assistant-advanced && task install" + +# 3. 
Start everything (API server + playground + TMAN Designer) +docker exec -d ten_agent_dev bash -c \ + "cd /app/agents/examples/voice-assistant-advanced && \ + task run > /tmp/task_run.log 2>&1" +``` + +**CRITICAL**: Always use `task run` to start — never run `./bin/api` directly. + +## Ports + +| Port | Service | +| ----- | ---------------- | +| 8080 | Go API server | +| 3000 | Playground (Next.js) | +| 49483 | TMAN Designer | + +## Health Checks + +```bash +curl -s http://localhost:8080/health +# {"code":"0","data":null,"msg":"ok"} + +curl -s http://localhost:8080/graphs | jq -r '.data[].name' +# voice_assistant, voice_assistant_heygen, etc. +``` + +## Restart Procedures + +| What Changed | Container? | Server? | Frontend? | +| ------------------------------- | ---------- | ----------------- | ----------------- | +| `property.json` (graphs added) | No | Nuclear restart | Nuclear restart | +| `property.json` (config only) | No | No | No | +| `.env` file | Yes | Yes | No | +| Python extension code | No | Yes | No | +| Go server code | No | Yes + `task install` | No | + +**Nuclear restart** (safest after graph changes): + +```bash +sudo docker exec ten_agent_dev bash -c "pkill -9 -f 'bin/api'; pkill -9 node; pkill -9 bun" +sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" +sleep 2 +sudo docker exec -d ten_agent_dev bash -c \ + "cd /app/agents/examples/voice-assistant-advanced && task run > /tmp/task_run.log 2>&1" +``` + +**After container restart**: always reinstall Python deps, then `task run`. + +**After .env changes**: `docker compose down && docker compose up -d`, reinstall deps, `task run`. 
+ +## Logs + +```bash +# All logs (inside container) +docker exec ten_agent_dev tail -f /tmp/task_run.log + +# Filter by extension or channel +docker exec ten_agent_dev tail -f /tmp/task_run.log | grep --line-buffered "deepgram" +``` + +## Related Deep Dives + +- [Deployment](deep_dives/deployment.md) — Docker Compose, Cloudflare tunnel, Nginx, Grafana monitoring diff --git a/docs/ai/L1/02_architecture.md b/docs/ai/L1/02_architecture.md new file mode 100644 index 0000000000..c35f537d7e --- /dev/null +++ b/docs/ai/L1/02_architecture.md @@ -0,0 +1,142 @@ +# 02 Architecture + +> System design overview: extensions, graphs, connections, and the server-worker model. + +## TEN Ecosystem + +| Component | Purpose | +| --------------------- | ------------------------------------------------------ | +| TEN Framework | Core runtime (C/C++, Go, Python, Node.js bindings) | +| TEN Agent Examples | Pre-built agent configurations (this repo's `ai_agents/`) | +| TEN VAD | Low-latency voice activity detection | +| TEN Turn Detection | Full-duplex dialogue management | +| TEN Portal | Documentation and blog site | + +## Extension System + +Extensions are modular components that process data — ASR, TTS, LLM, tools, RTC, avatars, etc. 
+Each extension has a lifecycle: + +``` +on_init() → on_start() → [process messages] → on_stop() → on_deinit() +``` + +Every extension contains: + +| File | Purpose | +| ----------------- | ------------------------------------------ | +| `addon.py` | Registration via `@register_addon_as_extension` | +| `extension.py` | Main logic, inherits from a base class | +| `manifest.json` | Metadata, dependencies, API interface | +| `property.json` | Default configuration values | + +**Base classes** (in `ten_ai_base/interface/ten_ai_base/`): + +| Base Class | Use For | +| ----------------------------- | ----------------- | +| `AsyncASRBaseExtension` | Speech-to-text | +| `AsyncTTS2BaseExtension` | Text-to-speech | +| `AsyncLLMBaseExtension` | Chat completion | +| `AsyncLLMToolBaseExtension` | LLM function tools| +| `AsyncExtension` | Generic / custom | + +## Graph-Based Configuration + +Agents are assembled by defining **graphs** in `property.json`. A graph specifies +which extensions run (nodes) and how data flows between them (connections). + +```json +{ + "predefined_graphs": [{ + "name": "voice_assistant", + "auto_start": true, + "graph": { + "nodes": [ + {"type": "extension", "name": "stt", "addon": "deepgram_asr_python", "property": {}}, + {"type": "extension", "name": "llm", "addon": "openai_llm2_python", "property": {}}, + {"type": "extension", "name": "tts", "addon": "elevenlabs_tts2_python", "property": {}} + ], + "connections": [...] 
+ } + }] +} +``` + +## Connection Types + +| Type | Payload | Example | +| ------------- | -------------------- | -------------------------------------------- | +| `cmd` | Named commands | `tool_register`, `on_user_joined`, `flush` | +| `data` | Named data messages | `asr_result`, `text_data`, `tts_text_input` | +| `audio_frame` | PCM audio streams | `pcm_frame` (16-bit, mono, 16/24/48 kHz) | +| `video_frame` | Video streams | Raw video frames for vision/avatar | + +## RTC-First Design + +TEN uses Agora RTC (Real-Time Communication) as the default transport, not WebSockets. + +| Aspect | RTC (default) | WebSocket | +| ---------------- | -------------------------------- | -------------------------- | +| Latency | 50-150ms (UDP-based) | Higher (TCP-based) | +| Codec support | Opus, VP8, VP9, AV1 | Raw PCM only | +| Bandwidth adapt | Built-in adaptation + FEC | Manual implementation | +| Use case | Real-time voice/video | Signaling, configuration | + +WebSockets are used for signaling and configuration; RTC handles the media path. 
+ +## Server-Worker Model + +``` +┌─────────────────┐ ┌──────────────────┐ +│ Go HTTP Server │────▶│ Worker Process │ (one per session) +│ (port 8080) │ │ (tman run start) │ +│ │ │ │ +│ /start → spawn │ │ Loads graph from │ +│ /stop → kill │ │ property.json │ +│ /ping → keep │ │ Runs extensions │ +└─────────────────┘ └──────────────────┘ +``` + +- **POST /start** spawns a worker process for a channel/session +- **POST /stop** terminates the worker +- **POST /ping** keeps the session alive (if timeout != -1) + +## Property Injection + +When `/start` is called, the server auto-injects dynamic values into the graph: + +- `channel_name` → injected into every node that has a `"channel"` property +- `remote_stream_id`, `bot_stream_id`, `token` → injected via `startPropMap` +- `req.Properties[extensionName]` → merged into specific node properties + +This is future-proof: any new extension with a "channel" property automatically +receives the dynamic channel value without code changes. + +## Component Diagram + +``` + Client (Browser/Mobile) + │ + ▼ + ┌──────────────┐ + │ Playground │ Next.js frontend (port 3000) + │ (UI) │ + └──────┬───────┘ + │ REST API + ▼ + ┌──────────────┐ ┌──────────────────────────────────┐ + │ Go Server │──spawn─▶│ Worker Process │ + │ (port 8080) │ │ ┌─────┐ ┌─────┐ ┌─────┐ │ + │ │ │ │ ASR │─▶│ LLM │─▶│ TTS │ │ + │ │ │ └──┬──┘ └─────┘ └──┬──┘ │ + └──────────────┘ │ │ │ │ + │ ┌──┴───────────────────┴──┐ │ + │ │ Agora RTC │ │ + │ └─────────────────────────┘ │ + └──────────────────────────────────┘ +``` + +## Related Deep Dives + +- [Server Architecture](deep_dives/server_architecture.md) — Go server internals, property injection pipeline +- [Graph Configuration](deep_dives/graph_configuration.md) — Node schema, connection wiring, parallel routing diff --git a/docs/ai/L1/03_code_map.md b/docs/ai/L1/03_code_map.md new file mode 100644 index 0000000000..dd60723726 --- /dev/null +++ b/docs/ai/L1/03_code_map.md @@ -0,0 +1,117 @@ +# 03 Code Map + +> 
Directory tree, module responsibilities, and key file locations. + +## Top-Level Structure + +All AI agent development happens inside `ai_agents/`: + +``` +ai_agents/ +├── agents/ +│ ├── ten_packages/ +│ │ ├── extension/ # 93+ extensions (ASR, TTS, LLM, tools, avatar) +│ │ └── system/ # Core runtime packages +│ │ ├── ten_ai_base/ # Base classes and API interface definitions +│ │ ├── ten_runtime_python/ +│ │ └── ten_runtime_go/ +│ ├── examples/ # 24+ example agent configurations +│ │ ├── voice-assistant/ +│ │ ├── voice-assistant-advanced/ +│ │ ├── voice-assistant-realtime/ +│ │ ├── voice-assistant-video/ +│ │ ├── doodler/ +│ │ └── ... +│ ├── integration_tests/ # Test frameworks +│ │ ├── asr_guarder/ # ASR integration tests +│ │ └── tts_guarder/ # TTS integration tests +│ └── scripts/ # Build and packaging scripts +├── server/ # Go API server +│ ├── main.go +│ └── internal/ +│ ├── http_server.go # REST endpoints, property injection +│ └── config.go # Parameter mapping (startPropMap) +├── playground/ # Next.js frontend UI (port 3000) +│ └── src/ # React components +├── esp32-client/ # ESP32 hardware client +├── Taskfile.yml # Root-level build/test tasks +├── docker-compose.yml # Container config +├── .env # Environment variables (single source) +└── .env.example # Template with all variables +``` + +Other repo-root directories: `core/` (C runtime), `packages/` (example/core extensions), +`docs/` (framework docs), `tools/` (Grafana monitoring, profilers). 
+ +## Extension Categories + +| Category | Count | Examples | +| --------- | ----- | ----------------------------------------------------------- | +| ASR | 10+ | `deepgram_asr_python`, `azure_asr_python`, `aws_asr_python` | +| TTS | 15+ | `deepgram_tts`, `elevenlabs_tts2_python`, `cartesia_tts` | +| LLM | 8+ | `openai_llm2_python`, `gemini_llm2_python`, `bedrock_llm_python` | +| Avatar | 5+ | `heygen_avatar_python`, `anam_avatar_python` | +| Tools | 8+ | `bingsearch_tool_python`, `vision_tool_python` | +| Transport | 3+ | `agora_rtc`, `websocket_server`, `http_server_python` | +| Other | 10+ | `message_collector2`, `ten_vad_python`, `mcp_client_python` | + +## Extension File Structure + +Every extension follows this layout: + +| File | Purpose | +| ------------------ | ---------------------------------------------- | +| `__init__.py` | Package marker | +| `addon.py` | `@register_addon_as_extension` registration | +| `extension.py` | Main logic, inherits from base class | +| `config.py` | Pydantic config model (optional but common) | +| `manifest.json` | Metadata, dependencies, API interface imports | +| `property.json` | Default config values with `${env:VAR}` syntax | +| `requirements.txt` | Python dependencies | +| `README.md` | Usage documentation (often multilingual) | +| `tests/` | Standalone tests with `bin/start` entry point | + +## Base Classes + +Located in example tenapp directories under `ten_packages/system/ten_ai_base/interface/ten_ai_base/`: + +| File | Class | Purpose | +| ---------- | ---------------------------- | -------------------------- | +| `asr.py` | `AsyncASRBaseExtension` | Speech recognition | +| `tts.py` | `AsyncTTSBaseExtension` | Text-to-speech (basic) | +| `tts2.py` | `AsyncTTS2BaseExtension` | Text-to-speech (advanced) | +| `llm.py` | `AsyncLLMBaseExtension` | Language model completion | +| `llm2.py` | `AsyncLLM2BaseExtension` | Language model v2 | +| `llm_tool.py` | `AsyncLLMToolBaseExtension` | LLM function calling tools | +| 
`mllm.py` | `AsyncMLLMBaseExtension` | Multimodal LLM | + +## API Interface Definitions + +Standard interfaces in `ten_ai_base/api/`: + +| File | Defines | +| ----------------------- | --------------------------------- | +| `asr-interface.json` | ASR data/cmd/audio_frame schemas | +| `tts-interface.json` | TTS data/cmd/audio_frame schemas | +| `llm-interface.json` | LLM data/cmd schemas | +| `mllm-interface.json` | Multimodal LLM schemas | + +Extensions reference these via `manifest.json`: +```json +{"api": {"interface": [{"import_uri": "../../system/ten_ai_base/api/tts-interface.json"}]}} +``` + +## Key Files Quick Reference + +| When working on... | Look at | +| -------------------------- | -------------------------------------------------- | +| New extension | Similar extension in `agents/ten_packages/extension/` | +| API interface changes | `ten_ai_base/api/*.json` | +| Graph configuration | `agents/examples/*/tenapp/property.json` | +| Server endpoints | `server/internal/http_server.go` | +| Build/test tasks | `Taskfile.yml` (root) and per-example | +| Test setup | `agents/ten_packages/extension/*/tests/bin/start` | + +## Related Deep Dives + +- [Extension Development](deep_dives/extension_development.md) — Full creation guide with base class details diff --git a/docs/ai/L1/04_conventions.md b/docs/ai/L1/04_conventions.md new file mode 100644 index 0000000000..41f11901f8 --- /dev/null +++ b/docs/ai/L1/04_conventions.md @@ -0,0 +1,138 @@ +# 04 Conventions + +> Coding patterns, naming, configuration, and formatting standards. 
+ +## Naming Conventions + +| Item | Pattern | Example | +| --------------- | -------------------------------------- | -------------------------- | +| Extension dir | `<vendor>_<type>_python` | `deepgram_asr_python` | +| Addon name | Same as directory name | `deepgram_asr_python` | +| Example dir | `voice-assistant-<variant>` | `voice-assistant-realtime` | +| Config class | `<Vendor>Config(BaseModel)` | `DeepgramTTSConfig` | +| Client class | `<Vendor>Client` | `DeepgramTTSClient` | + +## Addon Registration + +Every extension must register via decorator in `addon.py`: + +```python +from ten_runtime import Addon, register_addon_as_extension, TenEnv + +@register_addon_as_extension("deepgram_asr_python") +class DeepgramASRExtensionAddon(Addon): + def on_create_instance(self, ten: TenEnv, addon_name: str, context) -> None: + ten.on_create_instance_done(DeepgramASRExtension(addon_name), context) +``` + +The decorator name **must match** the `addon` field in `property.json` graph nodes. + +## Base Class Selection + +| Need | Base Class | Key Abstract Methods | +| ----------------------- | ----------------------------- | ------------------------------------- | +| Speech-to-text | `AsyncASRBaseExtension` | `vendor()`, `start_connection()`, `send_audio()`, `finalize()` | +| Text-to-speech (HTTP) | `AsyncTTS2HttpExtension` | `vendor()`, `request_tts()`, `synthesize_audio_sample_rate()` | +| Text-to-speech (WS) | `AsyncTTS2BaseExtension` | `vendor()`, `request_tts()`, `cancel_tts()` | +| Chat completion | `AsyncLLMBaseExtension` | `on_call_chat_completion()`, `on_data_chat_completion()` | +| LLM function tool | `AsyncLLMToolBaseExtension` | `get_tool_metadata()`, `run_tool()` | +| Generic / custom | `AsyncExtension` | `on_cmd()`, `on_data()`, etc. 
| + +## Pydantic Configuration + +Extensions use Pydantic models for config validation: + +```python +from pydantic import BaseModel, Field + +class DeepgramTTSConfig(BaseModel): + api_key: str = "" + model: str = "aura-2-theia-en" + sample_rate: int = 24000 + params: dict[str, Any] = Field(default_factory=dict) +``` + +Config is loaded from property.json in `on_init()`: +```python +config_json, _ = await ten_env.get_property_to_json("") +self.config = DeepgramTTSConfig(**json.loads(config_json)) +``` + +## Environment Variable Syntax + +In `property.json`, reference env vars: + +| Syntax | Behavior | +| --------------------- | --------------------------------------- | +| `${env:VAR_NAME}` | Required — error if missing | +| `${env:VAR_NAME\|}` | Optional — empty string if missing | +| `${env:VAR_NAME\|default}` | Optional — uses default if missing | + +```json +{"api_key": "${env:DEEPGRAM_API_KEY}", "region": "${env:AZURE_REGION|}"} +``` + +## Params Dict Pattern + +Extensions using HTTP/WebSocket services store all config in a `params` dictionary: + +1. **Store** `api_key` inside `params` dict in property.json and config +2. **Extract** for authentication headers in the client constructor +3. 
**Strip** from params **only when creating the HTTP request payload** + +```python +# In client constructor — extract for auth +self.api_key = config.params.get("api_key", "") +self.headers = {"Authorization": f"Bearer {self.api_key}"} + +# In request method — strip before sending +payload = {**self.config.params} +payload.pop("api_key", None) +``` + +## Sensitive Data Logging + +Implement `to_str()` to encrypt sensitive fields before logging: + +```python +def to_str(self, sensitive_handling: bool = True) -> str: + if not sensitive_handling: + return f"{self}" + config = copy.deepcopy(self) + if config.params and "api_key" in config.params: + config.params["api_key"] = utils.encrypt(config.params["api_key"]) + return f"{config}" +``` + +## Logging + +- Use `ten_env.log_info()`, `ten_env.log_warn()`, `ten_env.log_error()`, `ten_env.log_debug()` +- Categories: `LOG_CATEGORY_KEY_POINT` (lifecycle events), `LOG_CATEGORY_VENDOR` (vendor status) +- All output goes to `/tmp/task_run.log` inside the container + +## Import Convention + +```python +# Correct (v0.11+) +from ten_runtime import Addon, register_addon_as_extension, TenEnv + +# Wrong (old v0.8.x — will not work) +from ten import Addon +``` + +## Formatting + +- **Black** formatter with `--line-length 80` +- Run: `task format` (from `ai_agents/`) +- Check: `task check` +- Excludes: `third_party/`, `http_server_python/`, `ten_packages/system` + +## Design Principles + +- **YAGNI**: Only implement what is needed now, not what might be needed later +- **KISS**: Prefer simple solutions; three similar lines > premature abstraction +- **No git-ignored files**: Never modify auto-generated files (manifest-lock.json, out/, .ten/, bin/) + +## Related Deep Dives + +- [Extension Development](deep_dives/extension_development.md) — Full creation guide with implementation walkthroughs diff --git a/docs/ai/L1/05_workflows.md b/docs/ai/L1/05_workflows.md new file mode 100644 index 0000000000..b8ad2b729b --- /dev/null +++ 
b/docs/ai/L1/05_workflows.md @@ -0,0 +1,181 @@ +# 05 Workflows + +> Step-by-step guides for common development tasks. + +## Create a New TTS / ASR / LLM Extension + +**Fastest path**: Copy a similar extension and adapt it. + +| Type | Copy From | Base Class | +| ----------- | -------------------------- | --------------------------- | +| TTS (HTTP) | `rime_http_tts` | `AsyncTTS2HttpExtension` | +| TTS (WS) | `deepgram_tts` | `AsyncTTS2BaseExtension` | +| ASR | `deepgram_asr_python` | `AsyncASRBaseExtension` | +| LLM | `openai_llm2_python` | `AsyncLLMBaseExtension` | + +```bash +cp -r agents/ten_packages/extension/deepgram_tts agents/ten_packages/extension/my_vendor_tts +``` + +Then: +1. Rename addon decorator, class names, `manifest.json` `name` field +2. Implement the abstract methods for your vendor API +3. Create `tests/configs/` with required config files (see below) +4. Run guarder tests: `task tts-guarder-test EXTENSION=my_vendor_tts` +5. Run formatter: `task format` + +**Required test config files** for TTS: `property.json`, `property_basic_audio_setting1.json`, +`property_basic_audio_setting2.json`, `property_dump.json`, `property_miss_required.json`, +`property_invalid.json` + +**Required test config files** for ASR: `property_en.json`, `property_zh.json`, +`property_invalid.json`, `property_dump.json` + +For full walkthrough with code and all 15/10 test details, see +[Extension Development](deep_dives/extension_development.md) and [Testing](deep_dives/testing.md). + +## Add Extension to a Graph + +1. **Add node** to `predefined_graphs[].graph.nodes[]` in the example's `tenapp/property.json`: + ```json + {"type": "extension", "name": "my_tts", "addon": "my_tts_python", + "extension_group": "tts_group", + "property": {"api_key": "${env:MY_API_KEY}"}} + ``` + +2. 
**Add connections** — wire data flow between extensions:
+   ```json
+   {"extension": "my_tts",
+    "data": [{"name": "tts_text_input", "source": [{"extension": "main"}]}],
+    "audio_frame": [{"name": "pcm_frame", "dest": [{"extension": "agora_rtc"}]}]}
+   ```
+
+3. **Add dependency** to example `tenapp/manifest.json`:
+   ```json
+   {"type": "extension", "name": "my_tts_python", "version": "0.1.0"}
+   ```
+
+4. **Install** (use `task install`, not just `tman install` — the latter can wipe `bin/main`):
+   ```bash
+   docker exec ten_agent_dev bash -c "cd /app/agents/examples/<example> && task install"
+   ```
+
+5. **Nuclear restart** (required when graphs are added/removed):
+   ```bash
+   sudo docker exec ten_agent_dev bash -c \
+     "pkill -9 -f 'bin/api'; pkill -9 -f bun; pkill -9 -f node; pkill -9 -f next-server; pkill -9 -f tman"
+   sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock"
+   sleep 30 # wait for port 3000 TIME_WAIT to clear
+   sudo docker exec -d ten_agent_dev bash -c \
+     "cd /app/agents/examples/<example> && task run > /tmp/task_run.log 2>&1"
+   ```
+
+See [Graph Configuration](deep_dives/graph_configuration.md) for connection types and routing patterns.
+
+**For complex multi-graph setups** (A/B testing vendors, avatar variants), use
+`rebuild_property.py` instead of hand-editing. See
+[Generating property.json](deep_dives/graph_configuration.md#generating-propertyjson-with-rebuild_propertypy).
+
+## Customize the Main Extension
+
+The "main" extension orchestrates agent behavior (greetings, tool routing, interruption). 
+Three implementation variants exist: + +| Variant | File | Use Case | +| -------------------- | --------------------- | ------------------------------- | +| Python Cascade | `main_python_cascade` | ASR → LLM → TTS pipeline | +| Python Realtime V2V | `main_python_realtime`| OpenAI Realtime API (voice-to-voice) | +| Node.js Cascade | `main_nodejs_cascade` | TypeScript implementation | + +Modify `on_data()` to change event routing, `on_cmd()` for tool handling. + +## Run Tests + +```bash +# All tests +docker exec ten_agent_dev bash -c "cd /app && task test" + +# Single extension (with dependency install) +docker exec ten_agent_dev bash -c \ + "cd /app && task test-extension EXTENSION=agents/ten_packages/extension/deepgram_tts" + +# Single extension (skip install — faster) +docker exec ten_agent_dev bash -c \ + "cd /app && task test-extension-no-install EXTENSION=agents/ten_packages/extension/deepgram_tts" + +# ASR guarder integration tests +docker exec ten_agent_dev bash -c \ + "cd /app && task asr-guarder-test EXTENSION=azure_asr_python" + +# TTS guarder integration tests +docker exec ten_agent_dev bash -c \ + "cd /app && task tts-guarder-test EXTENSION=deepgram_tts" +``` + +See [Testing](deep_dives/testing.md) for test structure and debugging. 
+
+## Restart After Changes
+
+| What Changed | Action |
+| ------------------------------- | ---------------------------------------------------- |
+| `property.json` (graphs added) | Nuclear restart (kill all, remove lock, task run) |
+| `property.json` (config only) | No restart needed (loaded per session) |
+| `.env` | `docker compose down && docker compose up -d` + deps |
+| Python code | Restart server only |
+| Go code | `task install` then restart server |
+| Container restart | Reinstall Python deps, then `task run` |
+
+## Build and Install
+
+```bash
+# Full install (first time or after adding extensions) — ALWAYS prefer this
+docker exec ten_agent_dev bash -c \
+  "cd /app/agents/examples/<example> && task install"
+
+# Install Python deps only
+docker exec ten_agent_dev bash -c \
+  "cd /app/agents/examples/<example>/tenapp && bash scripts/install_python_deps.sh"
+
+# Install extension dependencies only (creates symlinks) — WARNING: can wipe bin/main
+docker exec ten_agent_dev bash -c \
+  "cd /app/agents/examples/<example>/tenapp && tman install"
+```
+
+## Update Extension Code in Running Container
+
+When iterating on extension code locally:
+
+```bash
+# Copy updated files into the container (use /. to avoid nested dirs)
+sudo docker cp ./agents/ten_packages/extension/my_ext/. 
\
+  ten_agent_dev:/app/agents/ten_packages/extension/my_ext/
+
+# Verify symlink exists in the example's tenapp
+sudo docker exec ten_agent_dev bash -c \
+  "ls -la /app/agents/examples/<example>/tenapp/ten_packages/extension/my_ext"
+
+# If missing, create it manually
+sudo docker exec ten_agent_dev bash -c \
+  "ln -sf /app/agents/ten_packages/extension/my_ext \
+   /app/agents/examples/<example>/tenapp/ten_packages/extension/my_ext"
+
+# Then nuclear restart
+```
+
+## Pre-Commit Checks
+
+```bash
+# Format Python code (Black, line-length 80)
+docker exec ten_agent_dev bash -c "cd /app && task format"
+
+# Check formatting without modifying
+docker exec ten_agent_dev bash -c "cd /app && task check"
+```
+
+Pre-commit hooks validate: API key patterns, Black formatting, conventional commit messages.
+
+## Related Deep Dives
+
+- [Extension Development](deep_dives/extension_development.md) — Full extension creation with code examples
+- [Graph Configuration](deep_dives/graph_configuration.md) — Connection wiring and routing patterns
+- [Testing](deep_dives/testing.md) — Test infrastructure, guarder tests, debugging
diff --git a/docs/ai/L1/06_interfaces.md b/docs/ai/L1/06_interfaces.md
new file mode 100644
index 0000000000..d87b91514f
--- /dev/null
+++ b/docs/ai/L1/06_interfaces.md
@@ -0,0 +1,150 @@
+# 06 Interfaces
+
+> REST API contracts, graph connection schemas, and base class abstract methods. 
+ +## REST API Endpoints + +The Go server (`server/internal/http_server.go`) exposes: + +| Endpoint | Method | Purpose | Key Fields | +| -------------------- | ------ | ------------------------------------ | --------------------------------- | +| `/health` | GET | Health check | Returns `{"code":"0"}` | +| `/graphs` | GET | List available graphs | Returns `data[].name` | +| `/start` | POST | Start agent session | `graph_name`, `channel_name` | +| `/stop` | POST | Stop agent session | `channel_name` | +| `/ping` | POST | Keep session alive | `channel_name` | +| `/list` | GET | List active sessions | Returns worker list | +| `/token/generate` | POST | Generate Agora RTC token | `channel_name`, `uid` | + +### POST /start Request Body + +```json +{ + "request_id": "uuid", + "channel_name": "test_channel", + "user_uid": 176573, + "graph_name": "voice_assistant", + "properties": { + "openai_llm2_python": {"model": "gpt-4o-mini"} + }, + "timeout": 60 +} +``` + +- `properties` — per-extension overrides merged into graph node properties +- `timeout` — seconds of inactivity before auto-stop (-1 = never) + +## Graph Connection Types + +Connections in `property.json` define data flow between extensions: + +### Command Connections (`cmd`) + +```json +{"extension": "main", "cmd": [ + {"name": "tool_register", "dest": [{"extension": "llm"}]}, + {"name": "on_user_joined", "source": [{"extension": "agora_rtc"}]} +]} +``` + +Common commands: `tool_register`, `on_user_joined`, `flush`, `chat_completion_call`, +`update_configs` + +### Data Connections (`data`) + +```json +{"extension": "llm", "data": [ + {"name": "text_data", "source": [{"extension": "main"}]}, + {"name": "text_data", "dest": [{"extension": "tts"}]} +]} +``` + +Common data: `asr_result`, `text_data`, `tts_text_input`, `tts_audio_start`, +`tts_audio_end`, `error` + +### Audio Frame Connections (`audio_frame`) + +```json +{"extension": "agora_rtc", "audio_frame": [ + {"name": "pcm_frame", "dest": [{"extension": "stt"}]} 
+]} +``` + +### Video Frame Connections (`video_frame`) + +```json +{"extension": "agora_rtc", "video_frame": [ + {"name": "video_frame", "dest": [{"extension": "vision"}]} +]} +``` + +## Base Class Abstract Methods + +### ASR (`AsyncASRBaseExtension`) + +| Method | Returns | Purpose | +| --------------------------- | --------- | -------------------------------- | +| `vendor()` | `str` | Vendor name (e.g., "deepgram") | +| `start_connection()` | `None` | Connect to ASR service | +| `stop_connection()` | `None` | Disconnect | +| `send_audio(frame)` | `bool` | Send audio frame to service | +| `finalize()` | `None` | Drain pending audio | +| `is_connected()` | `bool` | Connection status check | +| `input_audio_sample_rate()` | `int` | Expected sample rate (e.g., 16000)| + +**Output helpers**: `send_asr_result()`, `send_asr_error()`, `send_asr_finalize_end()`, +`send_connect_delay_metrics()`, `send_vendor_metrics()` + +### TTS (`AsyncTTS2BaseExtension`) + +| Method | Returns | Purpose | +| ------------------------------- | -------- | ------------------------------------ | +| `vendor()` | `str` | Vendor name (e.g., "elevenlabs") | +| `request_tts(tts_text_input)` | `AsyncIterator` | Generate audio from text | +| `cancel_tts()` | `None` | Handle flush/cancellation | +| `synthesize_audio_sample_rate()`| `int` | Output sample rate (e.g., 24000) | +| `synthesize_audio_channels()` | `int` | Channel count (default: 1) | +| `synthesize_audio_sample_width()`| `int` | Bytes per sample (default: 2) | + +**Output helpers**: `send_tts_audio_data()`, `send_tts_audio_start()`, `send_tts_audio_end()`, +`send_tts_error()`, `send_tts_ttfb_metrics()`, `send_tts_text_result()` + +**State machine**: QUEUED → PROCESSING → FINALIZING → COMPLETED (per request) + +### LLM (`AsyncLLMBaseExtension`) + +| Method | Returns | Purpose | +| ------------------------------- | ------- | -------------------------------- | +| `on_call_chat_completion()` | varies | Handle sync command requests | +| 
`on_data_chat_completion()` | varies | Handle stream-based data input | +| `on_tools_update(tool_metadata)`| `None` | Handle new tool registration | + +**Tool flow**: Extensions register tools via `CMD_TOOL_REGISTER` → LLM stores in +`available_tools` → LLM calls tools during completion → results returned. + +## Manifest API Interface + +Extensions declare their API interface in `manifest.json`: + +```json +{ + "api": { + "interface": [ + {"import_uri": "../../system/ten_ai_base/api/tts-interface.json"} + ], + "property": { + "api_key": {"type": "string"}, + "model": {"type": "string"}, + "sample_rate": {"type": "int32"} + } + } +} +``` + +Interface JSON files define the standard cmd/data/audio_frame schemas for each extension type. + +## Related Deep Dives + +- [Extension Development](deep_dives/extension_development.md) — Implementing abstract methods +- [Server Architecture](deep_dives/server_architecture.md) — Endpoint handlers and property injection +- [Graph Configuration](deep_dives/graph_configuration.md) — Full connection wiring examples diff --git a/docs/ai/L1/07_gotchas.md b/docs/ai/L1/07_gotchas.md new file mode 100644 index 0000000000..e61011e489 --- /dev/null +++ b/docs/ai/L1/07_gotchas.md @@ -0,0 +1,235 @@ +# 07 Gotchas + +> Critical pitfalls, tribal knowledge, and troubleshooting. + +## CRITICAL: Property Getters Return Tuples + +All `get_property_*()` methods return `(value, error_or_none)`, not the raw value. + +```python +# WRONG — causes TypeError +threshold = await ten_env.get_property_float("threshold") +if threshold > 0.5: # TypeError: '>' not supported between 'float' and 'tuple' + +# CORRECT — extract from tuple +threshold_result = await ten_env.get_property_float("threshold") +threshold = threshold_result[0] if isinstance(threshold_result, tuple) else threshold_result +``` + +This applies to `get_property_string()`, `get_property_int()`, `get_property_float()`, +`get_property_bool()`. Always extract `[0]`. 
+ +## CRITICAL: Signal Handlers Forbidden + +Extensions run in worker threads. Signal handlers only work in the main thread. + +```python +# WRONG — raises ValueError: signal only works in main thread +signal.signal(signal.SIGTERM, handler) +atexit.register(cleanup) + +# CORRECT — use extension lifecycle +async def on_stop(self, ten_env): + await self.cleanup() +``` + +## CRITICAL: Always Use `task run` + +Never start the server with `./bin/api` or `./bin/main` directly. +`task run` sets the correct PYTHONPATH and starts all services together +(API server + playground + TMAN Designer). + +## Zombie Worker Processes + +Worker processes (`bin/main`) run on the **host machine**, not inside Docker. +They survive container restarts and server restarts. + +```bash +# Check for zombies +ps -elf | grep 'bin/main' | grep -v grep + +# Kill them +ps -elf | grep 'bin/main' | grep -v grep | awk '{print $4}' | xargs -r sudo kill -9 +``` + +Always kill zombies before restarting the server. + +## .env Loaded at Container Startup Only + +Editing `.env` while the container is running has **no effect**. You must: + +```bash +cd /home/ubuntu/ten-framework/ai_agents +docker compose down && docker compose up -d +# Then reinstall Python deps and task run +``` + +## Node.js Version for Playground + +Playground requires Node.js >= 20.9.0. The host machine may have an older version. +Always run playground from **inside the container** (has Node 22): + +```bash +# WRONG: running from host with Node 18 +cd playground && npm run dev # Fails + +# CORRECT: task run starts playground inside container automatically +``` + +## Next.js Lock File + +After crashes, `.next/dev/lock` becomes stale, preventing restart: + +```bash +sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" +``` + +Always use nuclear restart after playground crashes. + +## Python Deps Not Persisted + +Python dependencies are installed into the container's filesystem and are lost +on container restart. 
Always reinstall after `docker compose down && up`:
+
+```bash
+docker exec ten_agent_dev bash -c \
+  "cd /app/agents/examples/voice-assistant-advanced/tenapp && bash scripts/install_python_deps.sh"
+```
+
+## tman Install Creates Symlinks
+
+Never manually create symlinks with `ln -s` for extensions.
+Always use `tman install` which resolves dependencies and creates correct links:
+
+```bash
+docker exec ten_agent_dev bash -c "cd /app/agents/examples/<example>/tenapp && tman install"
+```
+
+**Important:** If `tman install` doesn't create a symlink for a new extension (e.g., after
+adding it to `manifest.json`), create it manually as a fallback:
+
+```bash
+sudo docker exec ten_agent_dev bash -c \
+  "ln -sf /app/agents/ten_packages/extension/my_ext \
+   /app/agents/examples/<example>/tenapp/ten_packages/extension/my_ext"
+```
+
+## docker cp Creates Nested Directories
+
+When using `docker cp` to update extension code in the container, beware of
+trailing slashes creating nested directories:
+
+```bash
+# WRONG — creates /app/.../deepgram_tts/deepgram_tts/ (nested)
+sudo docker cp ./deepgram_tts/ container:/app/.../deepgram_tts/
+
+# CORRECT — copy contents into existing directory
+sudo docker cp ./deepgram_tts/. container:/app/.../deepgram_tts/
+```
+
+If you see `ModuleNotFoundError: No module named 'ten_packages.extension.X'`
+after a `docker cp`, check for nested directories inside the extension folder.
+
+## tman install Can Wipe bin/main
+
+Running `tman install` when system dependencies have newer versions will replace
+the runtime packages, which **deletes `bin/main`**. 
You must run the full +`task install` (not just `tman install`) to rebuild it: + +```bash +# This alone can break things if runtime versions changed: +docker exec ten_agent_dev bash -c "cd /app/.../tenapp && tman install" + +# This is safe — rebuilds bin/main after tman install: +docker exec ten_agent_dev bash -c "cd /app/agents/examples/ && task install" +``` + +Signs: Worker fails with `bin/main: No such file or directory` in logs. + +## Audio Routing: Split at Source Only + +When routing audio to multiple destinations, the split must happen at the +source node (e.g., `agora_rtc`), not at intermediate nodes. Splitting from +intermediate nodes can cause crashes. + +```json +// CORRECT: agora_rtc sends pcm_frame to both stt AND vad +{"extension": "agora_rtc", "audio_frame": [ + {"name": "pcm_frame", "dest": [{"extension": "stt"}, {"extension": "vad"}]} +]} +``` + +## Frontend Caches Graph List + +The playground caches the `/graphs` API response. When adding or removing graphs +from `property.json`, a nuclear restart is required — simple server restart +is not enough. + +## Manifest Module Name Must Match + +The `name` field in extension `manifest.json` must exactly match the `addon` +field used in graph nodes in `property.json`. Mismatches cause silent failures. + +## Apple Silicon Docker + +Docker containers may need Rosetta for x86 images on Apple Silicon Macs. +Enable in Docker Desktop: Settings → General → Use Rosetta for x86_64/amd64 emulation. + +## Windows Line Endings + +Before cloning on Windows, configure git to preserve Unix line endings: + +```bash +git config --global core.autocrlf false +``` + +## Nuclear Restart Recipe + +When in doubt, use the nuclear option. **Must kill `next-server` too** — it +holds port 3000 even after its parent `node` process is killed: + +```bash +# 1. 
Kill EVERYTHING (including next-server which holds port 3000) +sudo docker exec ten_agent_dev bash -c \ + "pkill -9 -f 'bin/api'; pkill -9 -f bun; pkill -9 -f node; pkill -9 -f next-server; pkill -9 -f tman" + +# 2. Clean up stale files +sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" + +# 3. Wait for port 3000 TIME_WAIT to clear (critical!) +# If Next.js can't bind port 3000, it silently starts on 3001/3002 which +# isn't exposed by Docker — the frontend appears down. +sleep 30 # or check: docker exec ten_agent_dev bash -c "cat /proc/net/tcp6 | grep ':0BB8' | wc -l" + +# 4. Start +sudo docker exec -d ten_agent_dev bash -c \ + "cd /app/agents/examples/voice-assistant && task run > /tmp/task_run.log 2>&1" + +# 5. Verify (wait ~12s for startup) +sleep 12 +sudo docker exec ten_agent_dev bash -c \ + "curl -s http://localhost:8080/health && curl -s -o /dev/null -w ' Frontend:%{http_code}' http://localhost:3000/" +``` + +**Verify the logs** — check Next.js started on port 3000 (not 3001+): +```bash +sudo docker exec ten_agent_dev bash -c "strings /tmp/task_run.log | grep -E 'Local:|Port|Ready|Error'" +``` + +If you see `Port 3000 is in use`, find and kill the process holding it: +```bash +sudo docker exec ten_agent_dev bash -c \ + "for pid in /proc/[0-9]*/fd/*; do \ + link=\$(readlink \$pid 2>/dev/null); \ + echo \"\$link\" | grep -q socket: && \ + inode=\$(echo \$link | grep -oP '\\d+') && \ + grep -q \$inode /proc/net/tcp6 2>/dev/null && \ + grep \$inode /proc/net/tcp6 | grep -q ':0BB8' && \ + echo PID=\$(echo \$pid | cut -d/ -f3) && break; \ + done" +``` + +## Related Deep Dives + +- [Deployment](deep_dives/deployment.md) — Production setup, persistent startup +- [Server Architecture](deep_dives/server_architecture.md) — Worker lifecycle, session management diff --git a/docs/ai/L1/08_security.md b/docs/ai/L1/08_security.md new file mode 100644 index 0000000000..5efef433c8 --- /dev/null +++ b/docs/ai/L1/08_security.md @@ -0,0 +1,88 @@ +# 
08 Security + +> Secret management, input validation, and repository hygiene. + +## API Key Management + +- **Single source**: All API keys live in `ai_agents/.env` (git-ignored) +- **Never hardcode** keys in `property.json` — use `${env:VAR_NAME}` substitution +- **Persistent storage**: Keep a copy of keys outside the repo (e.g., `~/api_keys.txt`) + so branch switches don't lose them +- See `.env.example` for the complete variable catalog + +## Environment Variable Substitution + +In `property.json`, reference secrets via: + +```json +{ + "api_key": "${env:DEEPGRAM_API_KEY}", + "region": "${env:AZURE_REGION|eastus}" +} +``` + +| Syntax | Behavior | +| ------------------------- | ---------------------------- | +| `${env:VAR}` | Required — error if missing | +| `${env:VAR\|}` | Optional — empty if missing | +| `${env:VAR\|default}` | Optional — default if missing| + +## Sensitive Data in Logs + +Extensions must encrypt sensitive fields before logging: + +```python +def to_str(self, sensitive_handling: bool = True) -> str: + config = copy.deepcopy(self) + if config.params and "api_key" in config.params: + config.params["api_key"] = utils.encrypt(config.params["api_key"]) + return f"{config}" +``` + +Never log raw API keys, tokens, or credentials. 
+ +## Server-Side Protections + +The Go server (`http_server.go`) implements: + +- **Path traversal prevention**: Ignores client-requested `tenapp_dir`, always uses + the launch-configured directory +- **Channel name sanitization**: Validated before use in file operations +- **Safe type conversion**: Property values are type-checked during merge +- **Recursive property merge**: Prevents injection via nested config overrides + +## Pre-Commit Hooks + +| Hook | What It Checks | +| ------------- | ----------------------------------------------------------- | +| `pre-commit` | Scans staged files for API key patterns (`API_KEY.*=[A-Za-z0-9]{20,}`) | +| `pre-commit` | Black formatting compliance (line-length 80) | +| `commit-msg` | Conventional commit format, blocks AI tool name references | + +## Git-Ignored Files + +These are auto-generated — never modify or commit them: + +| Pattern | Source | +| ---------------------- | ------------------------- | +| `manifest-lock.json` | `tman` dependency resolve | +| `compile_commands.json`| Build system | +| `BUILD.gn`, `.gn` | Build configuration | +| `out/`, `build/` | Build output | +| `.ten/` | TEN runtime files | +| `bin/main`, `bin/worker`| Compiled binaries | +| `.release/` | Release packaging | +| `node_modules/` | JS dependencies | +| `.env` | Environment secrets | + +## Files That Should Never Be Committed + +- `.env` (API keys and secrets) +- `*.pem` (certificates) +- `*.pcm` (audio dumps) +- Credential files, tokens, session data + +## Related Deep Dives + +- [Deployment](deep_dives/deployment.md) — Production security considerations +- [Server Architecture](deep_dives/server_architecture.md) — Server-side validation details diff --git a/docs/ai/L1/deep_dives/_index.md b/docs/ai/L1/deep_dives/_index.md new file mode 100644 index 0000000000..34502c601f --- /dev/null +++ b/docs/ai/L1/deep_dives/_index.md @@ -0,0 +1,9 @@ +# Deep Dives Index + +| Document | Summary | Load When | +| 
-------------------------------------------------------- | ------------------------------------------------ | ------------------------------------------------ | +| [extension_development.md](extension_development.md) | Full extension creation guide, base classes, test configs, pre-submission checklist | Creating a new TTS/ASR/LLM extension | +| [graph_configuration.md](graph_configuration.md) | Graph nodes, connections, routing, property.json | Modifying graphs or wiring extensions together | +| [testing.md](testing.md) | All 15 TTS + 10 ASR guarder tests, pass criteria, config files, debugging | Running or debugging tests for an extension | +| [deployment.md](deployment.md) | Docker, Cloudflare, Nginx, Grafana monitoring | Deploying to production or setting up monitoring | +| [server_architecture.md](server_architecture.md) | Go server, property injection, worker lifecycle | Understanding server internals or debugging | diff --git a/docs/ai/L1/deep_dives/deployment.md b/docs/ai/L1/deep_dives/deployment.md new file mode 100644 index 0000000000..3ffae48c77 --- /dev/null +++ b/docs/ai/L1/deep_dives/deployment.md @@ -0,0 +1,206 @@ +# Deployment + +> **When to Read This:** Load this document when you are deploying to production, +> setting up HTTPS access, configuring monitoring, or ensuring services persist +> across session closures. + +## Docker Compose Setup + +The development container is defined in `ai_agents/docker-compose.yml`: + +```yaml +services: + ten_agent_dev: + image: ghcr.io/ten-framework/ten_agent_build:0.7.14 + container_name: ten_agent_dev + ports: + - "49483:49483" # TMAN Designer + - "3000:3000" # Playground + - "8000-9001:8000-9001" # API + worker range + volumes: + - .:/app + environment: + - LOG_PATH=${LOG_PATH} +``` + +Start: `cd ai_agents && docker compose up -d` + +## Persistent Startup (Survives Session Closure) + +Use `-d` flag with `docker exec` to keep services running after terminal disconnect: + +```bash +# 1. 
Clean up existing processes +sudo docker exec ten_agent_dev bash -c "pkill -9 -f 'bin/api'; pkill -9 node; pkill -9 bun" +ps -elf | grep 'bin/main' | grep -v grep | awk '{print $4}' | xargs -r sudo kill -9 2>/dev/null + +# 2. Remove stale lock files +sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" + +# 3. Install Python dependencies +sudo docker exec ten_agent_dev bash -c \ + "cd /app/agents/examples/voice-assistant-advanced/tenapp && bash scripts/install_python_deps.sh" + +# 4. Start everything in detached mode +sudo docker exec -d ten_agent_dev bash -c \ + "cd /app/agents/examples/voice-assistant-advanced && task run > /tmp/task_run.log 2>&1" + +# 5. Wait and verify +sleep 15 +curl -s http://localhost:8080/health && echo " API ready" +curl -s http://localhost:8080/graphs | jq -r '.data | length' | xargs echo "Graphs:" +curl -s http://localhost:3000 -o /dev/null -w '%{http_code}' | xargs echo "Playground:" +``` + +Key: `-d` flag keeps processes running. `task run` starts API + playground + TMAN Designer. 
+ +## Cloudflare Tunnel (Free HTTPS) + +Quick HTTPS access without domain or SSL setup: + +```bash +# Start tunnel +pkill cloudflared +nohup cloudflared tunnel --url http://localhost:3000 > /tmp/cloudflare_tunnel.log 2>&1 & +sleep 5 + +# Get the random URL +grep -o 'https://[^[:space:]]*\.trycloudflare\.com' /tmp/cloudflare_tunnel.log | head -1 +# Example: https://films-colon-msgid-incentives.trycloudflare.com +``` + +- Free tunnels get **random URLs** that change on restart +- No DNS configuration needed +- Good for development and demos + +## Nginx Reverse Proxy (Production HTTPS) + +For production with custom domain and SSL certificates: + +```nginx +server { + listen [::]:453 ssl ipv6only=on; + listen 453 ssl; + ssl_certificate /etc/letsencrypt/live/oai.agora.io/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/oai.agora.io/privkey.pem; + include /etc/letsencrypt/options-ssl-nginx.conf; + ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem; + + # API endpoints + location ~ ^/(health|ping|token|start|stop|graphs|list)(/|$) { + proxy_pass http://localhost:8080; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Playground (with WebSocket upgrade) + location / { + proxy_pass http://localhost:3000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } +} +``` + +Apply: `sudo nginx -t && sudo systemctl reload nginx` + +## Production Build + +```bash +# Build optimized frontend +docker exec ten_agent_dev bash -c "cd /app/playground && npm run build" + +# Start production server +docker exec -d ten_agent_dev bash -c \ + "cd /app/playground && npm start > /tmp/playground_prod.log 2>&1" 
+``` + +## Grafana Monitoring + +Located in `tools/grafana-monitoring/`. Three deployment modes: + +### Pull Mode (Development) + +Prometheus scrapes a metrics endpoint exposed by the TEN runtime: + +```json +// In property.json +{ + "ten": { + "exporter": { + "enabled": true, + "type": "prometheus", + "prometheus": { + "listen_address": "0.0.0.0", + "listen_port": 49484 + } + } + } +} +``` + +Setup: `cd tools/grafana-monitoring && docker compose -f docker-compose.pull.yml up -d` + +### Push Mode (Production) + +Uses OTEL Collector to push metrics to Prometheus and logs to Loki: + +```json +// In property.json +{ + "ten": { + "exporter": { + "enabled": true, + "type": "otlp", + "otlp": { + "endpoint": "http://otel-collector:4317" + } + } + } +} +``` + +Setup: `cd tools/grafana-monitoring && docker compose -f docker-compose.push.yml up -d` + +### Hybrid Mode + +Both Pull and Push simultaneously — useful for A/B testing or migration. + +### Monitored Metrics + +| Metric | Good Threshold | What It Measures | +| --------------------------------- | -------------- | ----------------------------------- | +| Extension Lifecycle Duration | < 1 second | on_configure, on_init, on_start, on_stop, on_deinit | +| Extension CMD Processing Duration | < 100ms | P50/P95 command handling time | +| Thread Message Queue Wait Time | < 50ms | Time messages wait before processing| + +### Log Aggregation (Push Mode Only) + +Push mode sends logs to Loki for centralized querying: + +``` +# LogQL query examples +{service_name="ten_agent"} |= "error" +{service_name="ten_agent"} | json | level="error" +{service_name="ten_agent"} |= "deepgram" | json +``` + +## After Container Restart Checklist + +1. Reinstall Python dependencies (not persisted) +2. Start server with `task run` +3. Restart Cloudflare tunnel (if using) +4. Kill any zombie worker processes on host +5. 
Verify with `/health` and `/graphs` endpoints + +## See Also + +- [Back to Setup](../01_setup.md) +- [Server Architecture](server_architecture.md) — Worker lifecycle, session management diff --git a/docs/ai/L1/deep_dives/extension_development.md b/docs/ai/L1/deep_dives/extension_development.md new file mode 100644 index 0000000000..ba8588742d --- /dev/null +++ b/docs/ai/L1/deep_dives/extension_development.md @@ -0,0 +1,653 @@ +# Extension Development + +> **When to Read This:** Load this document when you are creating a new TTS, ASR, or LLM +> extension. It covers the exact files to create, base classes to inherit, abstract methods +> to implement, test configs to provide, and guarder tests your extension must pass. + +## Quick Start: Copy an Existing Extension + +The fastest way to create a new extension is to copy a similar one: + +| Extension Type | Good Template to Copy | Base Class | +| -------------- | ------------------------------- | --------------------------- | +| TTS (HTTP) | `rime_http_tts` | `AsyncTTS2HttpExtension` | +| TTS (WebSocket)| `deepgram_tts` | `AsyncTTS2BaseExtension` | +| ASR (WebSocket)| `deepgram_asr_python` | `AsyncASRBaseExtension` | +| LLM | `openai_llm2_python` | `AsyncLLMBaseExtension` | +| LLM Tool | `bingsearch_tool_python` | `AsyncLLMToolBaseExtension` | + +```bash +cp -r agents/ten_packages/extension/deepgram_tts agents/ten_packages/extension/my_vendor_tts +# Then rename: addon decorator, class names, manifest.json name field +``` + +## Directory Structure + +``` +my_vendor_tts_python/ +├── __init__.py # Can be empty +├── addon.py # Registration (MUST match manifest.json name) +├── extension.py # Main logic OR orchestration +├── my_vendor_tts.py # Vendor client (websocket/http logic) +├── config.py # Pydantic config model +├── manifest.json # Metadata + API interface + property schema +├── property.json # Defaults with ${env:VAR} syntax +├── requirements.txt # Python deps +├── README.md # Usage docs +└── tests/ + ├── bin/ + │ └── 
start # Test entry script (sets PYTHONPATH, runs pytest) + └── configs/ + ├── property.json # Default test config + ├── property_basic_audio_setting1.json # Sample rate test 1 (e.g. 16000) + ├── property_basic_audio_setting2.json # Sample rate test 2 (e.g. 24000) + ├── property_dump.json # Audio dump test config + ├── property_miss_required.json # Missing API key test + └── property_invalid.json # Invalid API key test +``` + +## Step 1: addon.py + +```python +from ten_runtime import Addon, register_addon_as_extension, TenEnv + +@register_addon_as_extension("my_vendor_tts_python") +class MyVendorTTSAddon(Addon): + def on_create_instance(self, ten: TenEnv, addon_name: str, context) -> None: + from .extension import MyVendorTTSExtension + ten.on_create_instance_done(MyVendorTTSExtension(addon_name), context) +``` + +The decorator name **must exactly match** `manifest.json` `name` field AND the `addon` +field in graph nodes. + +## Step 2: config.py + +```python +from pydantic import BaseModel, Field +from typing import Any +import copy +from ten_ai_base import utils + +class MyVendorTTSConfig(BaseModel): + api_key: str = "" + model: str = "default-model" + sample_rate: int = 24000 + dump: bool = False + dump_path: str = "" + params: dict[str, Any] = Field(default_factory=dict) + + def validate(self) -> None: + key = self.api_key or self.params.get("api_key", "") + if not key: + raise ValueError("API key is required") + + def to_str(self, sensitive_handling: bool = True) -> str: + if not sensitive_handling: + return f"{self}" + config = copy.deepcopy(self) + if config.params and "api_key" in config.params: + config.params["api_key"] = utils.encrypt(config.params["api_key"]) + return f"{config}" +``` + +## Step 3: manifest.json + +```json +{ + "type": "extension", + "name": "my_vendor_tts_python", + "version": "0.1.0", + "dependencies": [ + {"type": "system", "name": "ten_runtime_python", "version": "0.8"} + ], + "api": { + "interface": [ + {"import_uri": 
"../../system/ten_ai_base/api/tts-interface.json"} + ], + "property": { + "api_key": {"type": "string"}, + "model": {"type": "string"}, + "sample_rate": {"type": "int32"}, + "dump": {"type": "bool"}, + "dump_path": {"type": "string"}, + "params": {"type": "object"} + } + } +} +``` + +Use `tts-interface.json` for TTS, `asr-interface.json` for ASR, `llm-interface.json` for LLM. + +## Step 4: property.json + +```json +{ + "api_key": "${env:MY_VENDOR_API_KEY}", + "model": "default-model", + "sample_rate": 24000 +} +``` + +## Step 5: extension.py — Implementing the Base Class + +### TTS Extension (WebSocket Mode) + +```python +from ten_ai_base.tts2 import AsyncTTS2BaseExtension + +class MyVendorTTSExtension(AsyncTTS2BaseExtension): + def vendor(self) -> str: + return "my_vendor" + + async def on_init(self, ten_env) -> None: + await super().on_init(ten_env) + config_json, _ = await ten_env.get_property_to_json("") + self.config = MyVendorTTSConfig(**json.loads(config_json)) + self.config.validate() + + async def on_start(self, ten_env) -> None: + await super().on_start(ten_env) + self.client = MyVendorTTSClient(self.config, ten_env) + await self.client.connect() + + async def on_stop(self, ten_env) -> None: + await super().on_stop(ten_env) + await self.client.close() + + async def request_tts(self, tts_text_input) -> AsyncIterator[tuple[bytes, int | None]]: + text = tts_text_input.get_text() + request_id = tts_text_input.get_request_id() + async for audio_chunk in self.client.synthesize(text, request_id): + yield audio_chunk, None # (bytes, event_status) + + async def cancel_tts(self) -> None: + await self.client.cancel() + + def synthesize_audio_sample_rate(self) -> int: + return self.config.sample_rate + + def synthesize_audio_channels(self) -> int: + return 1 # mono + + def synthesize_audio_sample_width(self) -> int: + return 2 # 16-bit +``` + +**TTS2 state machine**: The base class manages request states automatically: +QUEUED -> PROCESSING -> FINALIZING -> 
COMPLETED. Your `request_tts()` just yields audio bytes. + +**Output events sent automatically** by the base class: +- `tts_audio_start` — when first audio chunk is ready +- `pcm_frame` — for each audio chunk +- `tts_audio_end` — when request completes +- `tts_error` — on failure + +### TTS Extension (HTTP Mode) + +Simpler — for non-streaming HTTP APIs: + +```python +from ten_ai_base.tts2_http import AsyncTTS2HttpExtension + +class MyVendorTTSExtension(AsyncTTS2HttpExtension): + def vendor(self) -> str: + return "my_vendor" + + async def request_tts(self, text: str, request_id: str) -> AsyncIterator[bytes]: + async with httpx.AsyncClient() as client: + async with client.stream("POST", self.url, json={"text": text}) as resp: + async for chunk in resp.aiter_bytes(): + yield chunk + + def synthesize_audio_sample_rate(self) -> int: + return self.config.sample_rate +``` + +### ASR Extension + +```python +from ten_ai_base.asr import AsyncASRBaseExtension + +class MyVendorASRExtension(AsyncASRBaseExtension): + def vendor(self) -> str: + return "my_vendor" + + async def start_connection(self) -> None: + self.ws = await websockets.connect(self.url, headers=self.auth_headers) + # Start a listener task for results + asyncio.create_task(self._listen_for_results()) + + async def stop_connection(self) -> None: + if self.ws: + await self.ws.close() + self.ws = None + + async def send_audio(self, frame) -> bool: + buf = frame.lock_buf() + data = bytes(buf) + frame.unlock_buf(buf) + await self.ws.send(data) + return True + + async def finalize(self) -> None: + await self.ws.send(json.dumps({"type": "CloseStream"})) + # Wait for final results before returning + + def is_connected(self) -> bool: + return self.ws is not None and self.ws.open + + def input_audio_sample_rate(self) -> int: + return 16000 + + async def _listen_for_results(self): + async for msg in self.ws: + result = json.loads(msg) + if result.get("is_final"): + asr_result = ASRResult(text=result["transcript"], 
language="en-US", ...) + await self.send_asr_result(asr_result) +``` + +**ASR output methods** you must call: +- `await self.send_asr_result(asr_result)` — for each transcription +- `await self.send_asr_error(error, vendor_info)` — on vendor errors +- `await self.send_asr_finalize_end()` — when finalize completes + +**Buffer strategy**: Override `buffer_strategy()` to return `ASRBufferConfigModeKeep` +if you want audio buffered during disconnects (default discards). + +### LLM Extension + +```python +from ten_ai_base.llm import AsyncLLMBaseExtension + +class MyLLMExtension(AsyncLLMBaseExtension): + async def on_call_chat_completion(self, ten_env, **kwargs): + # Handle command-based chat requests + pass + + async def on_data_chat_completion(self, ten_env, **kwargs): + # Handle stream-based data input + pass + + async def on_tools_update(self, ten_env, tool_metadata): + async with self._available_tools_lock: + self.available_tools = tool_metadata +``` + +--- + +## TTS Audio Pipeline: Data Types and Flow + +Understanding the data types is critical for implementing TTS extensions correctly. 
+ +### Data Flow Through the Pipeline + +``` +User speaks → Agora RTC → pcm_frame → ASR → asr_result → main_control + → text_data → LLM → text_data → main_control → tts_text_input → TTS + → pcm_frame → Agora RTC → User hears +``` + +### tts_text_input (incoming to your extension) + +```python +class TTSTextInput: + request_id: str # Unique request identifier + text: str # Text chunk to synthesize + text_input_end: bool # True = last chunk for this request_id + metadata: dict # Context: {session_id, turn_id, ...} +``` + +- Multiple `tts_text_input` messages can share one `request_id` (the "append" pattern) +- `text_input_end=True` signals no more text is coming for this request +- The base class handles queuing and buffering — your `request_tts()` receives complete inputs + +### tts_audio_start / tts_audio_end (outgoing from your extension) + +These are sent automatically by the base class. You don't need to send them manually. + +```json +// tts_audio_start +{"request_id": "req1", "metadata": {"session_id": "sess1", "turn_id": 1}} + +// tts_audio_end +{ + "request_id": "req1", + "request_event_interval_ms": 1500, + "request_total_audio_duration_ms": 3200, + "reason": 1, + "metadata": {"session_id": "sess1", "turn_id": 1} +} +``` + +**Reason values**: `REQUEST_END` (1) = normal completion, `INTERRUPTED` (2) = flush/cancel, +`ERROR` (3) = failure. + +### tts_flush / tts_flush_end + +Flush is triggered when the user interrupts (speaks while TTS is playing). + +```json +// tts_flush (incoming signal) +{"flush_id": "flush_abc123", "metadata": {"session_id": "sess1"}} + +// tts_flush_end (your extension's response — sent automatically by base class) +{"flush_id": "flush_abc123", "metadata": {"session_id": "sess1"}} +``` + +**Critical**: `flush_id` and `metadata` must be echoed back exactly. + +## Flush Handling in TTS Extensions + +The base class (`AsyncTTS2BaseExtension`) handles most flush logic automatically. 
+Your extension only needs to implement `cancel_tts()`: + +```python +async def cancel_tts(self) -> None: + """Called when a flush signal arrives. Stop any in-progress synthesis.""" + if self.client: + await self.client.cancel() +``` + +### What the Base Class Does on Flush + +1. Acquires `_put_lock` to block new `tts_text_input` arrivals +2. Clears `_flush_complete_event` to prevent race conditions +3. Flushes the internal queue (discards all pending items) +4. Calls `cancel_tts()` on your extension (you stop the vendor API) +5. Sends `tts_audio_end` with `reason=INTERRUPTED` for the current request +6. Sends `tts_flush_end` with the echoed `flush_id` and `metadata` +7. Resets all request state (ready for next request) +8. Sets `_flush_complete_event` to re-enable queue processing + +### Request Interleaving (How Buffering Works) + +When multiple requests arrive with different `request_id`s: + +1. First request is processed immediately (`_processing_request_id = "req1"`) +2. Messages for other request_ids are **buffered** in `_pending_messages` +3. When req1 completes, the next buffered request is released (FIFO order) +4. Each request maintains strict event ordering: `audio_start → frames → audio_end` + +Your `request_tts()` doesn't need to handle interleaving — the base class does it. + +## The Three property.json Files + +There are three distinct `property.json` files with different roles: + +### 1. Extension Defaults (`agents/ten_packages/extension//property.json`) + +Default config for the extension. Loaded when no overrides are specified: + +```json +{ + "api_key": "${env:MY_VENDOR_API_KEY}", + "model": "default-model", + "sample_rate": 24000 +} +``` + +### 2. 
App Graph Definition (`agents/examples/<example_name>/tenapp/property.json`)
+
+Defines the complete agent — nodes, connections, per-instance overrides:
+
+```json
+{
+  "ten": {
+    "predefined_graphs": [{
+      "name": "voice_assistant",
+      "graph": {
+        "nodes": [
+          {"name": "tts", "addon": "my_vendor_tts_python",
+           "property": {"model": "high-quality", "sample_rate": 24000}}
+        ],
+        "connections": [...]
+      }
+    }]
+  }
+}
+```
+
+Properties here **override** extension defaults for this specific graph instance.
+
+### 3. Test Configs (`agents/ten_packages/extension/<extension_name>/tests/configs/*.json`)
+
+Used by guarder tests. Each test loads a specific config file:
+
+```json
+{
+  "dump": true,
+  "dump_path": "./tests/dump_output/",
+  "params": {"key": "${env:MY_VENDOR_API_KEY}", "sample_rate": 16000}
+}
+```
+
+**Loading order**: Extension defaults → App graph overrides → Test config overrides.
+
+---
+
+## Step 6: Test Configuration Files
+
+Your extension's `tests/configs/` directory needs these config files for the guarder tests to work:
+
+### For TTS Extensions
+
+| Config File                           | Purpose                                | Content                                 |
+| ------------------------------------- | -------------------------------------- | --------------------------------------- |
+| `property.json`                       | Default test config                    | Valid API key, default model/settings   |
+| `property_basic_audio_setting1.json`  | Sample rate test 1                     | `sample_rate: 16000` + valid key        |
+| `property_basic_audio_setting2.json`  | Sample rate test 2                     | `sample_rate: 24000` + valid key        |
+| `property_dump.json`                  | Audio dump test                        | `dump: true, dump_path: "./tests/dump_output/"` |
+| `property_miss_required.json`         | Missing params error test              | Empty API key                           |
+| `property_invalid.json`               | Invalid params error test              | Empty or invalid API key                |
+
+**Example `property.json`** (for elevenlabs):
+```json
+{
+  "params": {
+    "key": "${env:ELEVENLABS_TTS_KEY}",
+    "model_id": "eleven_turbo_v2_5"
+  }
+}
+```
+
+**Example `property_basic_audio_setting1.json`**:
+```json
+{
+  "dump": true,
+  
"dump_path": "./tests/keep_dump_output/", + "params": { + "sample_rate": 16000, + "key": "${env:ELEVENLABS_TTS_KEY}" + } +} +``` + +**Example `property_basic_audio_setting2.json`**: +```json +{ + "dump": true, + "dump_path": "./tests/keep_dump_output/", + "params": { + "sample_rate": 24000, + "key": "${env:ELEVENLABS_TTS_KEY}" + } +} +``` + +**Example `property_miss_required.json`**: +```json +{ + "params": {"key": ""} +} +``` + +### For ASR Extensions + +| Config File | Purpose | Content | +| ------------------------ | -------------------------- | ------------------------------------ | +| `property_en.json` | English transcription test | Valid key + `language: "en-US"` | +| `property_zh.json` | Chinese transcription test | Valid key + `language: "zh-CN"` | +| `property_invalid.json` | Error handling test | `key: "invalid", region: "invalid"` | +| `property_dump.json` | Audio dump test | Valid key + `dump: true` | + +--- + +## Step 7: TTS Guarder Tests Your Extension Must Pass + +Run with: `task tts-guarder-test EXTENSION=my_vendor_tts_python` + +There are **15 tests**. 
Here's what each validates: + +### Must-Pass Tests + +| Test | What It Validates | +| --------------------------------------- | -------------------------------------------------------- | +| `test_append_input` | Multiple text inputs appended with same request_id | +| `test_append_input_stress` | High volume of append operations | +| `test_append_input_without_text_input_end` | Missing text_input_end flags handled gracefully | +| `test_append_interrupt` | New requests interrupting in-progress ones | +| `test_basic_audio_setting` | Different sample rates produce different audio | +| `test_corner_input` | Special chars, emojis, very short/long text | +| `test_dump` | Audio dump files created with valid PCM data | +| `test_dump_each_request_id` | Each request_id produces separate dump file | +| `test_empty_text_request` | Empty/whitespace text: audio_end within 500ms, no crash | +| `test_flush` | Flush signal: receives flush_end, no data after 5s | +| `test_interleaved_requests` | 8 concurrent requests maintain separate audio streams | +| `test_invalid_required_params` | Invalid API key returns FATAL ERROR, no crash | +| `test_invalid_text_handling` | Malformed text handled without crash | +| `test_metrics` | TTFB metrics generated with valid timestamps | +| `test_miss_required_params` | Missing API key returns appropriate error | + +### Critical Pass Criteria + +- **Event ordering**: `tts_audio_start` -> `pcm_frame`(s) -> `tts_audio_end` per request +- **Request isolation**: Interleaved requests must not mix audio streams +- **Error handling**: Invalid/missing configs must produce errors, never crashes +- **Empty text**: Must complete quickly (audio_end within 500ms), no audio generated +- **Flush**: After flush_end, no more data for 5 seconds +- **Dump files**: Valid PCM data, one file per request_id when enabled + +## Step 8: ASR Guarder Tests Your Extension Must Pass + +Run with: `task asr-guarder-test EXTENSION=my_vendor_asr_python` + +There are **10 tests** (1 
skipped by default): + +| Test | What It Validates | +| --------------------------- | ------------------------------------------------------------ | +| `test_connection_timing` | Connects and transcribes English audio correctly | +| `test_asr_result` | Result structure: id, text, language, session_id fields | +| `test_asr_finalize` | Finalize signal produces final=True result + finalize_end | +| `test_reconnection` | Recovers gracefully after connection failure | +| `test_vendor_error` | Invalid creds produce proper error with vendor info | +| `test_multi_language` | English (en-US) and Chinese (zh-CN) both transcribe correctly| +| `test_dump` | Audio dump files created correctly | +| `test_metrics` | TTFW and TTLW metrics: positive, TTLW > TTFW | +| `test_audio_timestamp` | start_ms and duration_ms accuracy | +| `test_long_duration_stream` | **Skipped by default** — 5+ min stream without timeout | + +### Critical Pass Criteria + +- **Result fields**: Every ASR result must have `id`, `text`, `language`, `session_id` +- **Finalize**: Must produce `final=True` result and `asr_finalize_end` response +- **Error format**: Errors must have `id`, `module`, `code`, `message` + vendor info +- **Metrics**: TTFW > 0, TTLW > TTFW, both in milliseconds +- **Audio format**: Accepts 16-bit PCM, 16kHz, mono, 320 bytes per frame + +--- + +## AudioFrame Creation Pattern + +```python +from ten_runtime import AudioFrame, AudioFrameDataFmt + +frame = AudioFrame.create("pcm_frame") +frame.set_sample_rate(16000) +frame.set_bytes_per_sample(2) # 16-bit +frame.set_number_of_channels(1) # Mono +frame.set_data_fmt(AudioFrameDataFmt.INTERLEAVE) +frame.set_samples_per_channel(len(pcm_data) // 2) +frame.alloc_buf(len(pcm_data)) +buf = frame.lock_buf() +buf[:] = pcm_data +frame.unlock_buf(buf) +await ten_env.send_audio_frame(frame) +``` + +Set all properties **before** `alloc_buf()`. + +## Params Dict Pattern + +For HTTP/WebSocket vendor APIs: + +1. 
Store all config including `api_key` in `params` dict +2. Extract `api_key` for auth headers in client constructor +3. Strip `api_key` from params **only when building the HTTP request payload** +4. In `update_params()`: add vendor-required params, normalize keys + +```python +# Client constructor +self.api_key = config.params.get("api_key", "") +self.headers = {"Authorization": f"Bearer {self.api_key}"} + +# Request method +payload = {**self.config.params} +payload.pop("api_key", None) +``` + +## Bidirectional Extension Pattern + +For extensions that both receive from and send to the graph: + +```python +class MyBridge(AsyncExtension): + async def on_init(self, ten_env): + self.ten_env = ten_env # Store for callbacks + + async def on_audio_frame(self, ten_env, audio_frame): + buf = audio_frame.lock_buf() + self.external_system.send(bytes(buf)) + audio_frame.unlock_buf(buf) + + async def _external_callback(self, data): + frame = AudioFrame.create("pcm_frame") + # ... fill frame ... + await self.ten_env.send_audio_frame(frame) +``` + +## Pre-Submission Checklist + +- [ ] `addon.py` decorator name matches `manifest.json` `name` field +- [ ] All abstract methods implemented (vendor, request_tts/send_audio, etc.) 
+- [ ] Config validation raises ValueError for missing required params +- [ ] `to_str()` encrypts sensitive fields before logging +- [ ] `tests/configs/` has all required config files (see Step 6) +- [ ] `task tts-guarder-test` or `task asr-guarder-test` passes +- [ ] `task format` passes (Black, line-length 80) +- [ ] `task lint-extension EXTENSION=my_vendor_tts_python` passes +- [ ] `requirements.txt` lists all Python dependencies +- [ ] `README.md` documents config properties and env vars +- [ ] No hardcoded API keys anywhere + +## Language-Specific Notes + +| Language | Create Command | +| ---------- | -------------------------------------------------------------------- | +| Python | `tman create extension name --template default_async_extension_python` | +| Go | `tman create extension name --template default_extension_go` | +| C++ | `tman create extension name --template default_extension_cpp` | +| Node.js | `tman create extension name --template default_extension_nodejs` | + +## Portal References (Full Guides) + +- [Create a TTS Extension (89K)](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/extension_dev/create_tts_extension.mdx) [EXTERNAL] +- [Create an ASR Extension (39K)](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/extension_dev/create_asr_extension.mdx) [EXTERNAL] +- [Create a Hello World Extension](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/extension_dev/create_hello_world_extension.mdx) [EXTERNAL] + +## See Also + +- [Back to Conventions](../04_conventions.md) +- [Back to Workflows](../05_workflows.md) +- [Testing](testing.md) — Full guarder test details and debugging diff --git a/docs/ai/L1/deep_dives/graph_configuration.md b/docs/ai/L1/deep_dives/graph_configuration.md new file mode 100644 index 0000000000..bfabb78ac8 --- /dev/null +++ b/docs/ai/L1/deep_dives/graph_configuration.md @@ -0,0 +1,410 @@ +# Graph Configuration + +> **When to 
Read This:** Load this document when you are modifying graph definitions +> in property.json, adding extensions to agent pipelines, or debugging data flow issues. + +## Overview + +Graphs define which extensions run and how they communicate. They are declared in +`property.json` under the `predefined_graphs` array within the `ten` namespace. + +## Property.json Structure + +```json +{ + "ten": { + "log": { + "handlers": [...] + }, + "predefined_graphs": [ + { + "name": "voice_assistant", + "auto_start": true, + "graph": { + "nodes": [...], + "connections": [...] + } + } + ] + } +} +``` + +- `name` — graph identifier, used in `/start` request's `graph_name` field +- `auto_start` — set to `true` by the server for the selected graph at runtime +- `graph.nodes` — extension instances +- `graph.connections` — data flow wiring + +## Node Schema + +```json +{ + "type": "extension", + "name": "stt", + "addon": "deepgram_asr_python", + "extension_group": "transcription_group", + "property": { + "api_key": "${env:DEEPGRAM_API_KEY}", + "model": "nova-2", + "language": "en-US", + "sample_rate": 16000 + } +} +``` + +| Field | Required | Purpose | +| ----------------- | -------- | ------------------------------------------------- | +| `type` | Yes | Always `"extension"` | +| `name` | Yes | Instance name (used in connections) | +| `addon` | Yes | Extension package name (must match manifest.json) | +| `extension_group` | No | Thread grouping for extensions | +| `property` | No | Config overrides merged with extension defaults | + +## Connection Schema + +Connections define how messages flow between extensions: + +```json +{ + "extension": "main", + "cmd": [ + {"name": "flush", "dest": [{"extension": "llm"}, {"extension": "tts"}]}, + {"name": "on_user_joined", "source": [{"extension": "agora_rtc"}]} + ], + "data": [ + {"name": "text_data", "source": [{"extension": "llm"}]}, + {"name": "text_data", "dest": [{"extension": "tts"}]} + ] +} +``` + +Each connection block is **from the 
perspective of the named extension**:
+- `source` — "this extension receives X from these sources"
+- `dest` — "this extension sends X to these destinations"
+
+## Full Graph Example
+
+A basic voice assistant pipeline (ASR → LLM → TTS). Note that the `main`
+orchestrator extension referenced in the connections must also be declared
+as a node:
+
+```json
+{
+  "name": "voice_assistant",
+  "auto_start": false,
+  "graph": {
+    "nodes": [
+      {
+        "type": "extension", "name": "agora_rtc", "addon": "agora_rtc",
+        "extension_group": "rtc_group",
+        "property": {"app_id": "${env:AGORA_APP_ID}", "channel": "default"}
+      },
+      {
+        "type": "extension", "name": "main", "addon": "main_python",
+        "extension_group": "control_group"
+      },
+      {
+        "type": "extension", "name": "stt", "addon": "deepgram_asr_python",
+        "extension_group": "stt_group",
+        "property": {"api_key": "${env:DEEPGRAM_API_KEY}", "model": "nova-2"}
+      },
+      {
+        "type": "extension", "name": "llm", "addon": "openai_llm2_python",
+        "extension_group": "llm_group",
+        "property": {"api_key": "${env:OPENAI_API_KEY}", "model": "${env:OPENAI_MODEL}"}
+      },
+      {
+        "type": "extension", "name": "tts", "addon": "elevenlabs_tts2_python",
+        "extension_group": "tts_group",
+        "property": {"api_key": "${env:ELEVENLABS_TTS_KEY}"}
+      }
+    ],
+    "connections": [
+      {
+        "extension": "agora_rtc",
+        "audio_frame": [
+          {"name": "pcm_frame", "dest": [{"extension": "stt"}]}
+        ]
+      },
+      {
+        "extension": "stt",
+        "data": [
+          {"name": "asr_result", "dest": [{"extension": "main"}]}
+        ]
+      },
+      {
+        "extension": "main",
+        "cmd": [
+          {"name": "flush", "dest": [{"extension": "llm"}, {"extension": "tts"}]},
+          {"name": "on_user_joined", "source": [{"extension": "agora_rtc"}]}
+        ],
+        "data": [
+          {"name": "text_data", "source": [{"extension": "llm"}]},
+          {"name": "text_data", "dest": [{"extension": "tts"}]}
+        ]
+      },
+      {
+        "extension": "tts",
+        "data": [
+          {"name": "tts_text_input", "source": [{"extension": "main"}]}
+        ],
+        "audio_frame": [
+          {"name": "pcm_frame", "dest": [{"extension": "agora_rtc"}]}
+        ]
+      }
+    ]
+  }
+}
+```
+
+## Connection Types Reference
+
+| Type          | Direction | Payload            | Example Names                        |
+| ------------- | --------- | ------------------ | 
----------------------------------- | +| `cmd` | Both | Named commands | `flush`, `tool_register`, `on_user_joined`, `chat_completion_call`, `update_configs` | +| `data` | Both | Named data msgs | `asr_result`, `text_data`, `tts_text_input`, `tts_audio_start`, `tts_audio_end`, `error` | +| `audio_frame` | Both | PCM audio streams | `pcm_frame` | +| `video_frame` | Both | Video streams | `video_frame` | + +## Parallel Audio Routing + +When sending audio to multiple destinations, split at the **source node**: + +```json +// CORRECT — split at agora_rtc (source) +{ + "extension": "agora_rtc", + "audio_frame": [ + {"name": "pcm_frame", "dest": [ + {"extension": "stt"}, + {"extension": "vad"} + ]} + ] +} +``` + +Do NOT split from intermediate nodes — this causes runtime crashes. + +## Property Injection + +When the server processes a `/start` request, it dynamically modifies the graph: + +1. **Graph selection**: Filters `predefined_graphs` to match `graph_name`, sets `auto_start: true` +2. **Channel injection**: Scans all nodes — any node with a `"channel"` property gets `channel_name` injected +3. **Start params**: Injects `remote_stream_id`, `bot_stream_id`, `token` via `startPropMap` +4. **Extension overrides**: Merges `req.Properties[extensionName]` into matching node properties +5. **Env var validation**: Resolves all `${env:VAR}` references + +This is why `agora_rtc` and any custom extension with a `"channel"` property automatically +receive the dynamic channel name without code changes. + +## Adding a New Graph + +1. Add a new entry to `predefined_graphs[]` in the example's `tenapp/property.json` +2. Ensure all referenced extensions are listed in `tenapp/manifest.json` +3. Run `tman install` to create symlinks for new dependencies +4. **Nuclear restart** required (frontend caches the graph list) + +## Generating property.json with rebuild_property.py + +For complex deployments with many graph variants, hand-editing property.json is +error-prone. 
The `voice-assistant-advanced` example uses a Python script to generate +it programmatically: + +**Location**: `agents/examples/voice-assistant-advanced/tenapp/rebuild_property.py` + +**Usage**: +```bash +docker exec ten_agent_dev bash -c \ + "cd /app/agents/examples/voice-assistant-advanced/tenapp && python3 rebuild_property.py" +``` + +### How It Works + +The script defines reusable **node configs** as Python dicts, then assembles them +into graphs with helper functions: + +```python +# 1. Define reusable node configs +nova3_stt_100ms = { + "type": "extension", "name": "stt", "addon": "deepgram_ws_asr_python", + "extension_group": "stt", + "property": { + "params": { + "api_key": "${env:DEEPGRAM_API_KEY}", + "model": "nova-3", "language": "en-US", + "interim_results": True, "endpointing": 100, + } + }, +} + +cartesia_tts_sonic3 = { + "type": "extension", "name": "tts", "addon": "cartesia_tts", + "extension_group": "tts", + "property": { + "dump": False, "dump_path": "./", + "params": { + "api_key": "${env:CARTESIA_TTS_KEY}", + "model_id": "sonic-3", + "output_format": {"container": "raw", "sample_rate": 44100}, + }, + }, +} + +gpt51_llm = { + "type": "extension", "name": "llm", "addon": "openai_llm2_python", + "extension_group": "chatgpt", + "property": { + "base_url": "https://api.openai.com/v1", + "api_key": "${env:OPENAI_API_KEY}", + "model": "gpt-5.1", "max_tokens": 1000, + "prompt": "...", "greeting": "...", + }, +} + +# 2. Define reusable connection templates +basic_connections = [ + {"extension": "main_control", "cmd": [...], "data": [...]}, + {"extension": "agora_rtc", "audio_frame": [...], "data": [...]}, + {"extension": "streamid_adapter", "audio_frame": [...]}, + {"extension": "tts", "data": [...], "audio_frame": [...]}, + # ... +] + +# 3. 
Assemble graphs with helper functions +def create_basic_voice_assistant(name, has_avatar=False, avatar_type=None, + tts_config=None, stt_config=None, llm_config=None): + nodes = [agora_rtc_base, stt_config or nova3_stt_100ms, llm_config or ..., ...] + connections = copy.deepcopy(basic_connections) + if has_avatar: + # Modify connections: route TTS audio through avatar instead of direct to RTC + ... + return {"name": name, "auto_start": False, "graph": {"nodes": nodes, "connections": connections}} + +# 4. Build graph list and write property.json +new_graphs = [ + create_basic_voice_assistant("voice_assistant"), + create_basic_voice_assistant("voice_assistant_heygen", has_avatar=True, avatar_type="heygen"), + create_apollo_graph("flux_apollo_gpt_5_1_cartesia", gpt51_llm, flux_stt), + # ... +] + +new_data = {"ten": {"log": log_config, "predefined_graphs": new_graphs}} +with open("property.json", "w") as f: + json.dump(new_data, f, indent=2) +``` + +### Key Patterns in rebuild_property.py + +| Pattern | Purpose | +| ---------------------------- | ---------------------------------------------------- | +| `copy.deepcopy(config)` | Prevent mutation when reusing node configs | +| Parametric helper functions | `create_basic_voice_assistant(name, tts_config=...)` | +| Connection rewiring for avatars | Route TTS audio through avatar instead of direct to RTC | +| Preserve existing log config | `log_config = data["ten"]["log"]` before overwriting | +| Commented-out graph groups | Keep old graph definitions for reference/reactivation| + +### When to Use rebuild_property.py + +- **Multiple graph variants** (A/B testing vendors: Deepgram vs Cartesia TTS) +- **Avatar variants** (same pipeline with/without HeyGen/Anam) +- **LLM model testing** (GPT-4o vs GPT-5.1 vs Groq) +- **Complex connection rewiring** (avatar graphs need different audio routing) + +For simple single-graph setups, editing property.json directly is fine. 
+ +## Manifest.json Dependencies + +When adding an extension to a graph, ensure its dependency is in `manifest.json`: + +```json +{ + "dependencies": [ + {"type": "extension", "name": "my_vendor_tts_python", "version": "0.1.0"} + ] +} +``` + +Then run: +```bash +docker exec ten_agent_dev bash -c "cd /app/agents/examples//tenapp && tman install" +``` + +## Main Extension Customization + +The "main" extension controls agent orchestration. Three variants exist: + +| Variant | Language | Pattern | Use Case | +| -------------------- | ---------- | ---------------------------- | -------------------------- | +| Python Cascade | Python | ASR → LLM → TTS pipeline | Standard voice assistant | +| Python Realtime V2V | Python | OpenAI Realtime API | Voice-to-voice (no ASR/TTS)| +| Node.js Cascade | TypeScript | ASR → LLM → TTS pipeline | TypeScript preference | + +Key customization points: +- `on_data()` — event routing (match/case dispatcher) +- `on_cmd()` — tool registration and handling +- Greeting logic in `on_start()` or `on_user_joined` handler + +## Example Apps + +Available in `agents/examples/`. 
Key examples: + +| Example | Description | +| --------------------------------- | ---------------------------------------------------- | +| `voice-assistant` | Basic: Deepgram ASR + OpenAI LLM + ElevenLabs TTS | +| `voice-assistant-advanced` | Multiple graph variants, vendor A/B testing | +| `voice-assistant-realtime` | OpenAI Realtime API (voice-to-voice, no ASR/TTS) | +| `voice-assistant-video` | Vision capability added | +| `voice-assistant-live2d` | Live2D avatar integration | +| `voice-assistant-sip-twilio` | SIP phone integration (Twilio) | +| `voice-assistant-sip-telnyx` | SIP phone integration (Telnyx) | +| `voice-assistant-sip-plivo` | SIP phone integration (Plivo) | +| `voice-assistant-with-ten-vad` | Custom VAD (Voice Activity Detection) | +| `voice-assistant-with-turn-detection` | Transformer-based turn detection | +| `voice-assistant-nodejs` | Node.js implementation | +| `doodler` | Spoken prompts → hand-drawn sketches | +| `speaker-diarization` | Real-time multi-speaker identification | +| `transcription` | Audio transcription tool | +| `websocket-example` | WebSocket transport (no Agora RTC) | +| `http-control` | HTTP-based control interface | + +### voice-assistant vs voice-assistant-advanced + +| Aspect | voice-assistant | voice-assistant-advanced | +| --------------------- | --------------------------- | --------------------------------- | +| Graphs | 1 (`voice_assistant`) | 4+ variants (Flux/Apollo/Cartesia)| +| Vendor switching | Fixed components | Multiple vendor combinations | +| LLM prompts | Simple greeting | Multi-step research workflows | +| Use case | Getting started | Production A/B testing | + +Both follow the same core pipeline: +``` +Agora RTC → streamid_adapter → ASR → main_control → LLM → TTS → Agora RTC +``` + +### Real Graph: voice-assistant/tenapp/property.json + +This is a complete, working graph. 
Key nodes: + +| Node | Addon | Role | +| ------------------ | ------------------------ | ---------------------------------- | +| `agora_rtc` | `agora_rtc` | Audio/video transport | +| `streamid_adapter` | `streamid_adapter` | Stream ID routing | +| `stt` | `deepgram_asr_python` | Speech-to-text | +| `llm` | `openai_llm2_python` | Language model | +| `tts` | `elevenlabs_tts2_python` | Text-to-speech | +| `main_control` | `main_python` | Orchestration (greetings, routing) | +| `message_collector` | `message_collector2` | Transcript collection | + +Connection wiring: +``` +agora_rtc --pcm_frame--> streamid_adapter --pcm_frame--> stt +stt --asr_result--> main_control +main_control --text_data--> llm --text_data--> main_control --tts_text_input--> tts +tts --pcm_frame--> agora_rtc +``` + +## Portal References + +- [Understanding property.json](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/project_structure/property_json.md) [EXTERNAL] +- [Customize Agent via Code](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/customize_agent/modify-main/index.mdx) [EXTERNAL] + +## See Also + +- [Back to Architecture](../02_architecture.md) +- [Back to Workflows](../05_workflows.md) +- [Back to Interfaces](../06_interfaces.md) diff --git a/docs/ai/L1/deep_dives/server_architecture.md b/docs/ai/L1/deep_dives/server_architecture.md new file mode 100644 index 0000000000..18bebf0a15 --- /dev/null +++ b/docs/ai/L1/deep_dives/server_architecture.md @@ -0,0 +1,211 @@ +# Server Architecture + +> **When to Read This:** Load this document when you need to understand how the Go API +> server works, how property injection transforms graph configurations at runtime, or +> how worker processes are managed. + +## Overview + +The TEN Agent server is a Go HTTP server built with the Gin framework. It manages +agent session lifecycles — starting worker processes, injecting configuration, and +handling session keepalive/teardown. 
+ +## Server Structure + +``` +server/ +├── main.go # Entry point, parses flags, starts HTTP server +└── internal/ + ├── http_server.go # All endpoint handlers + property injection + └── config.go # startPropMap configuration for parameter injection +``` + +Key launch flag: `-tenapp_dir=` — points to the example's `tenapp/` directory +containing `property.json` and `manifest.json`. + +## Endpoint Handlers + +| Handler | Route | Purpose | +| -------------------------------- | ------------------ | ----------------------------------- | +| `handlerHealth()` | `GET /health` | Returns `{"code":"0"}` if running | +| `handleGraphs()` | `GET /graphs` | Reads predefined_graphs from property.json | +| `handlerStart()` | `POST /start` | Spawns worker process for a session | +| `handlerStop()` | `POST /stop` | Terminates worker process | +| `handlerPing()` | `POST /ping` | Resets session timeout timer | +| `handlerList()` | `GET /list` | Lists active workers/channels | +| `handlerGenerateToken()` | `POST /token/generate` | Generates Agora RTC tokens | +| `handleAddonDefaultProperties()` | `GET /addon/default-properties` | Extension property.json files | +| `handlerVectorDocumentUpdate()` | `POST /vector/document/update` | Vector DB updates | +| `handlerVectorDocumentUpload()` | `POST /vector/document/upload` | File uploads for vector DB | + +## Property Injection Pipeline + +When `/start` is called, the server transforms the static `property.json` into a +session-specific configuration. 
This is the core of the `processProperty` function: + +### Step 1: Read Base Configuration + +```go +// Read property.json from the configured tenapp_dir +propertyJsonFile := filepath.Join(s.config.TenappDir, "property.json") +content, _ := os.ReadFile(propertyJsonFile) +``` + +### Step 2: Filter Graphs + +Only the requested graph is kept; its `auto_start` is set to `true`: + +```go +// Find matching graph by name +for _, graph := range predefinedGraphs { + if graph.Name == req.GraphName { + graph.AutoStart = true + filteredGraphs = append(filteredGraphs, graph) + } +} +``` + +### Step 3: Merge Dynamic Properties + +Per-extension property overrides from the request are merged: + +```go +// req.Properties = {"openai_llm2_python": {"model": "gpt-4o-mini"}} +for _, node := range graph.Nodes { + if props, ok := req.Properties[node.Name]; ok { + mergeProperties(node.Property, props) + } +} +``` + +### Step 4: Inject Start Parameters + +The `startPropMap` (defined in `config.go`) maps request fields to node properties: + +```go +var startPropMap = map[string]string{ + "RemoteStreamId": "remote_stream_id", + "BotStreamId": "agora_uid", + "Token": "token", + "WorkerHttpServerPort": "server_port", +} +``` + +These values are injected into every node that has the corresponding property defined. + +### Step 5: Channel Auto-Injection + +Any node with a `"channel"` property automatically receives the request's `channel_name`: + +```go +// Scan all nodes — if node has "channel" property, inject channel_name +for _, node := range graph.Nodes { + if _, hasChannel := node.Property["channel"]; hasChannel { + node.Property["channel"] = req.ChannelName + } +} +``` + +This is future-proof: adding a new extension with a `"channel"` property requires +zero server code changes. + +### Step 6: Environment Variable Resolution + +All `${env:VAR}` and `${env:VAR|default}` references in the property JSON are +resolved against the container's environment. 
+ +### Step 7: Write Temp File and Spawn Worker + +The modified property JSON is written to a temporary file, and a worker process +is spawned: + +```go +// Write modified config +tmpFile := filepath.Join(tmpDir, "property.json") +os.WriteFile(tmpFile, modifiedJSON, 0644) + +// Spawn worker +cmd := exec.Command("tman", "run", "start", "--property", tmpFile) +``` + +## Worker Process Lifecycle + +``` +/start request + │ + ▼ +Server: processProperty() → temp property.json + │ + ▼ +Server: exec("tman run start --property ") + │ + ▼ +Worker process starts → loads graph → initializes extensions + │ + ├── Extensions call on_init() → on_start() + ├── Extensions process messages (cmd, data, audio_frame, video_frame) + │ + ├── /ping requests reset the timeout timer + │ + ▼ +/stop request OR timeout + │ + ▼ +Worker: extensions call on_stop() → on_deinit() + │ + ▼ +Worker process terminates +``` + +**Important**: Worker processes run on the **host machine**, not inside Docker. +They can outlive the server process and even container restarts. Always check for +zombie workers with `ps -elf | grep 'bin/main'`. + +## Session Management + +| Action | Server Behavior | +| -------------- | -------------------------------------------------- | +| `/start` | Spawns worker, stores in active workers map | +| `/stop` | Sends SIGTERM to worker, removes from map | +| `/ping` | Resets timeout timer for the channel | +| Timeout | Auto-sends SIGTERM after `timeout` seconds idle | +| `/list` | Returns all active channel → worker mappings | + +Timeout of `-1` means the session never auto-stops (requires explicit `/stop`). + +## LOG_STDOUT for Worker Output + +Worker processes write to stdout. To see their output in `/tmp/task_run.log`, +the `.env` must have: + +```bash +LOG_STDOUT=true +``` + +Without this, extension logs (Python `print()`, `ten_env.log_*()`) are invisible. 
+ +## Security Measures + +- **Path traversal prevention**: The server ignores any client-provided `tenapp_dir` + and always uses the launch-configured path +- **Channel name sanitization**: Channel names are validated before use in file paths +- **Safe property merge**: `mergeProperties()` handles nested configs safely with + type checking + +## Configuration (config.go) + +The `startPropMap` in `config.go` controls which request fields map to which +node properties: + +| Request Field | Node Property | Purpose | +| ---------------------- | -------------------- | ------------------------------ | +| `RemoteStreamId` | `remote_stream_id` | Remote user's stream ID | +| `BotStreamId` | `agora_uid` | Bot's Agora UID | +| `Token` | `token` | Agora RTC token | +| `WorkerHttpServerPort` | `server_port` | Worker's HTTP server port | + +## See Also + +- [Back to Architecture](../02_architecture.md) +- [Graph Configuration](graph_configuration.md) — Property.json structure and connections +- [Back to Interfaces](../06_interfaces.md) diff --git a/docs/ai/L1/deep_dives/testing.md b/docs/ai/L1/deep_dives/testing.md new file mode 100644 index 0000000000..1ad2d72ecd --- /dev/null +++ b/docs/ai/L1/deep_dives/testing.md @@ -0,0 +1,295 @@ +# Testing + +> **When to Read This:** Load this document when you need to run tests for an extension, +> understand what the guarder tests validate, or debug test failures. + +## Overview + +Three levels of testing: +1. **Extension standalone tests** — per-extension unit/integration tests in `tests/` +2. **Guarder integration tests** — framework-level ASR/TTS validation suites +3. 
**Root-level tasks** — orchestrated via `Taskfile.yml` + +## Running Tests + +```bash +# All tests +docker exec ten_agent_dev bash -c "cd /app && task test" + +# Single extension with dependency install +docker exec ten_agent_dev bash -c \ + "cd /app && task test-extension EXTENSION=agents/ten_packages/extension/deepgram_tts" + +# Single extension, skip install (faster iteration) +docker exec ten_agent_dev bash -c \ + "cd /app && task test-extension-no-install EXTENSION=agents/ten_packages/extension/deepgram_tts" + +# TTS guarder (all 15 tests) +docker exec ten_agent_dev bash -c "cd /app && task tts-guarder-test EXTENSION=deepgram_tts" + +# ASR guarder (all 10 tests) +docker exec ten_agent_dev bash -c "cd /app && task asr-guarder-test EXTENSION=azure_asr_python" + +# Specific test only +docker exec ten_agent_dev bash -c "cd /app && task tts-guarder-test EXTENSION=deepgram_tts -- -k test_flush" +``` + +## Extension Standalone Tests + +Each extension can have `tests/` with a `bin/start` entry point: + +``` +my_extension/tests/ +├── bin/start # Sets PYTHONPATH, runs pytest +├── configs/ # Test config JSON files +│ ├── property.json +│ ├── property_basic_audio_setting1.json +│ ├── property_basic_audio_setting2.json +│ ├── property_dump.json +│ ├── property_miss_required.json +│ └── property_invalid.json +├── conftest.py # Fixtures +└── test_*.py # Test files +``` + +### PYTHONPATH + +Tests need this to import TEN runtime: + +```bash +export PYTHONPATH=".:ten_packages/system/ten_runtime_python/lib:\ +ten_packages/system/ten_runtime_python/interface:\ +ten_packages/system/ten_ai_base/interface:\ +ten_packages/extension/${EXT_NAME}:$PYTHONPATH" +``` + +--- + +## TTS Guarder Tests (15 Tests) + +**Location**: `agents/integration_tests/tts_guarder/` + +These tests run against any TTS extension. The manifest template (`manifest-tmpl.json`) +substitutes `{{extension_name}}` with your extension name at runtime. 
+ +### Test Inventory + +| # | Test | What It Validates | Pass Criteria | +|---|------|-------------------|---------------| +| 1 | `test_append_input` | Multiple texts appended with same request_id | audio_start -> frames -> audio_end per group, correct request_id | +| 2 | `test_append_input_stress` | High volume append operations | All appends processed without errors | +| 3 | `test_append_input_without_text_input_end` | Missing text_input_end flag | Processes correctly despite missing flags | +| 4 | `test_append_interrupt` | New requests interrupting in-progress ones | Interrupts handled without crash or malformed audio | +| 5 | `test_basic_audio_setting` | Different sample rates produce different audio | Two configs with different sample_rate yield different output rates | +| 6 | `test_corner_input` | Special chars, emojis, punctuation-only, very short/long | All processed without errors | +| 7 | `test_dump` | Audio dump file creation | Dump file exists, contains valid PCM, size matches duration | +| 8 | `test_dump_each_request_id` | Separate dump files per request_id | Each request_id has own dump file | +| 9 | `test_empty_text_request` | Empty/whitespace text | audio_end within 500ms, no audio data, no crash | +| 10 | `test_flush` | Flush signal handling | Receives flush_end with matching flush_id, no data for 5s after | +| 11 | `test_interleaved_requests` | 8 concurrent requests with different request_ids | Each maintains separate audio stream, correct ordering per request | +| 12 | `test_invalid_required_params` | Invalid API key | Returns FATAL ERROR with message, no crash | +| 13 | `test_invalid_text_handling` | Malformed text, null chars, very long strings | Handled gracefully without crash | +| 14 | `test_metrics` | TTFB metric generation | Metrics data present with valid timestamps | +| 15 | `test_miss_required_params` | Missing API key | Appropriate error returned | + +### Critical TTS Invariants + +1. 
**Event ordering must be**: `tts_audio_start` -> `pcm_frame`(s) -> `tts_audio_end` per request +2. **Request isolation**: Interleaved requests must never mix audio streams +3. **Error handling**: Invalid/missing configs produce errors, never crashes +4. **Empty text**: Must complete fast (audio_end within 500ms), generate no audio +5. **Flush**: After flush_end, zero data output for 5 seconds + +### Required TTS Config Files + +Your `tests/configs/` must provide: + +``` +property.json # Valid API key + default settings +property_basic_audio_setting1.json # sample_rate: 16000 + valid key + dump:true +property_basic_audio_setting2.json # sample_rate: 24000 + valid key + dump:true +property_dump.json # dump:true + dump_path + valid key +property_miss_required.json # Empty/missing API key +property_invalid.json # Empty/invalid API key +``` + +**Template** (`property_basic_audio_setting1.json`): +```json +{ + "dump": true, + "dump_path": "./tests/keep_dump_output/", + "params": { + "sample_rate": 16000, + "key": "${env:MY_VENDOR_API_KEY}" + } +} +``` + +### Sample Rate Test Notes + +Some extensions don't support multiple sample rates. To skip the sample rate +comparison (test still runs, just doesn't assert rates differ), the test runner +checks `ENABLE_SAMPLE_RATE` env var. Extensions like `openai_tts_python` and +`humeai_tts_python` set this to `False`. 
+ +--- + +## ASR Guarder Tests (10 Tests, 1 Skipped) + +**Location**: `agents/integration_tests/asr_guarder/` + +### Test Audio Format + +- 16-bit PCM, 16kHz sample rate, mono +- Test files: `test_data/16k_en_us.pcm` (English), `test_data/16k_zh_cn.pcm` (Chinese) +- Chunk size: 320 bytes per frame +- Send interval: 10ms between frames + +### Test Inventory + +| # | Test | What It Validates | Pass Criteria | +|---|------|-------------------|---------------| +| 1 | `test_connection_timing` | Connect + transcribe English audio | Results received, language="en-US" | +| 2 | `test_asr_result` | Result structure and data integrity | Fields: id, text, language, session_id all present | +| 3 | `test_asr_finalize` | Finalize signal → final result + finalize_end | final=True in result, finalize_end received | +| 4 | `test_reconnection` | Recovery after connection failure | Error detected, no crash, can reconnect | +| 5 | `test_vendor_error` | Invalid creds → proper error format | Error has id, module, code, message + vendor info | +| 6 | `test_multi_language` | English + Chinese transcription | en-US and zh-CN both detected correctly | +| 7 | `test_dump` | Audio dump functionality | Dump files created with correct data | +| 8 | `test_metrics` | TTFW and TTLW metrics | TTFW > 0, TTLW > TTFW, both in milliseconds | +| 9 | `test_audio_timestamp` | start_ms and duration_ms accuracy | Timestamps accurate within tolerance | +| 10 | `test_long_duration_stream` | **SKIPPED** — 5+ min stream | No timeout or connection drop | + +### Critical ASR Invariants + +1. **Result fields**: Every result must have `id`, `text`, `language`, `session_id` +2. **Finalize flow**: `asr_finalize` cmd -> `final=True` result -> `asr_finalize_end` response +3. **Error format**: `{id, module, code, message, vendor_info: {vendor, code, message}}` +4. 
**Metrics**: TTFW (Time To First Word) > 0, TTLW (Time To Last Word) > TTFW + +### Required ASR Config Files + +``` +property_en.json # Valid key + language: "en-US" +property_zh.json # Valid key + language: "zh-CN" +property_invalid.json # key: "invalid" (triggers vendor error test) +property_dump.json # Valid key + dump: true +``` + +**Template** (`property_en.json` for Deepgram): +```json +{ + "params": { + "key": "${env:DEEPGRAM_API_KEY}", + "model": "nova-2", + "sample_rate": 16000, + "encoding": "linear16", + "language": "en-US" + } +} +``` + +--- + +## Guarder Test Framework Internals + +### Manifest Template System + +Both guarders use template manifests with `{{extension_name}}` placeholders: + +```json +{ + "type": "app", + "name": "tts_guarder", + "version": "0.1.0", + "dependencies": [ + {"path": "../../ten_packages/extension/{{extension_name}}"} + ] +} +``` + +The Taskfile substitutes this at runtime with `sed`. + +### conftest.py Pattern + +Both guarders use a session-scoped FakeApp: + +```python +@pytest.fixture(scope="session", autouse=True) +def global_setup_and_teardown(): + event = threading.Event() + fake_app_ctx = FakeAppCtx(event) + fake_app_thread = threading.Thread(target=run_fake_app, args=(fake_app_ctx,)) + fake_app_thread.start() + event.wait() + yield + fake_app_ctx.fake_app.close() + fake_app_thread.join() +``` + +Each test creates its own `ExtensionTester` within this shared app context. +Tests share the session-scoped app but get fresh extension instances. 
+ +### Pytest Options + +- `--extension_name` — extension to test (required) +- `--config_dir` — path to configs directory (required) +- `--enable_sample_rate` — "True"/"False" for sample rate comparison (TTS only) + +--- + +## Common Test Failures and Fixes + +### "Timeout waiting for audio" +- **Cause**: External API not responding within timeout +- **Fix**: Check API key is valid, check network, increase timeout if needed +- **Note**: Some flakiness is expected with external APIs — run individually to confirm + +### "Received error data" / FATAL ERROR +- **Cause**: Extension detected invalid config and raised error (this is correct behavior for error tests) +- **Fix**: If this happens on non-error tests, check your config files have valid API keys + +### "Found N dump files, expected M" +- **Cause**: Some requests timed out and didn't produce dump files +- **Fix**: Usually API timeout flakiness — rerun the test + +### "Received additional data after flush_end" +- **Cause**: Extension sent audio data after it should have stopped +- **Fix**: Ensure your cancel_tts/flush handling stops all pending output immediately + +### "Test failed: sample rates are the same" +- **Cause**: Your extension ignores the sample_rate config +- **Fix**: Implement sample_rate support, or set ENABLE_SAMPLE_RATE=False if your API doesn't support it + +### Import errors +- **Cause**: PYTHONPATH doesn't include ten_runtime_python and ten_ai_base +- **Fix**: Check `tests/bin/start` script sets PYTHONPATH correctly + +### "ModuleNotFoundError: No module named 'ten_packages.extension.xxx'" +- **Cause**: Extension not installed in test environment +- **Fix**: Run `tman install --standalone` in extension directory, or use `task test-extension` (does it automatically) + +--- + +## CI/CD Pipeline + +### Manual Guarder Tests (GitHub Actions) + +ASR and TTS guarder tests can be triggered manually: + +- Workflow: `.github/workflows/manual_test_asr_guarder.yml` +- Inputs: `extension` name, 
`config_dir`, `branch`, `env_vars` (semicolon-separated secret names) +- API keys loaded from GitHub Secrets at runtime + +### Extension Publishing + +- Workflow: `.github/workflows/manual_publish_extension.yml` +- Steps: `tman install --standalone` -> `tman run build` -> `tman publish` +- Requires `TEN_CLOUD_STORE` secret for publishing + +--- + +## See Also + +- [Extension Development](extension_development.md) — Config files and pre-submission checklist +- [Back to Workflows](../05_workflows.md) From a92239ccf4ac0890e283d551a4a8d4a38cb6962d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 12:39:50 +0000 Subject: [PATCH 02/18] refactor: rewrite deepgram tts client with duplex websocket pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rewrite DeepgramTTSClient with separate send and receive async tasks on a single websocket, matching the cartesia_tts architecture. this replaces the serial send-then-receive pattern that caused state leaks between interleaved requests. key changes: - _send_loop(): reads from _text_queue, sends Speak+Flush to WS - _receive_loop(): reads from WS, puts events into _output_queue - _connection_loop(): auto-reconnect with exponential backoff - cancel drops audio in receive loop, Flushed always signals END - update docs/ai gotchas with deployment lessons learned test results unchanged at 14/16 guarder passed: - test_interleaved_requests: still fails — request 8/8 gets timeout because output queue has stale END from cancelled request. needs per-request-id event routing (next iteration). 
- test_subtitle_alignment: feature gap (no word-level timing) --- .../extension/deepgram_tts/deepgram_tts.py | 512 +++++++++++------- 1 file changed, 323 insertions(+), 189 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index 1564afcc2d..855f33148a 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -16,13 +16,18 @@ from ten_runtime import AsyncTenEnv from ten_ai_base.const import LOG_CATEGORY_VENDOR -# Custom event types to communicate status back to the extension +# Event types for the output queue EVENT_TTS_RESPONSE = 1 EVENT_TTS_END = 2 EVENT_TTS_ERROR = 3 EVENT_TTS_FLUSH = 4 EVENT_TTS_TTFB_METRIC = 5 +MAX_RETRY_TIMES = 5 + +# Sentinel to signal the send loop to stop +_SEND_STOP = None + class DeepgramTTSConnectionException(Exception): """Exception raised when Deepgram TTS connection fails""" @@ -31,11 +36,18 @@ def __init__(self, status_code: int, body: str): self.status_code = status_code self.body = body super().__init__( - f"Deepgram TTS connection failed (code: {status_code}): {body}" + f"Deepgram TTS connection failed " f"(code: {status_code}): {body}" ) class DeepgramTTSClient: + """Duplex WebSocket client for Deepgram TTS. + + Uses separate send and receive tasks on a single WebSocket + connection. Text goes into _text_queue via send_text(), + audio/events come out of _output_queue via get(). 
+ """ + def __init__( self, config: DeepgramTTSConfig, @@ -44,47 +56,247 @@ def __init__( send_non_fatal_tts_error: Callable[[str], asyncio.Future] | None = None, ): self.config = config - self.ten_env: AsyncTenEnv = ten_env - self._is_cancelled = False - self.ws: ClientConnection | None = None + self.ten_env = ten_env self.send_fatal_tts_error = send_fatal_tts_error self.send_non_fatal_tts_error = send_non_fatal_tts_error - self.sent_ts: datetime | None = None - self.ttfb_sent: bool = False + self._ws: ClientConnection | None = None + self._closing = False + self._is_cancelled = False + + # Duplex queues + self._text_queue: asyncio.Queue[str | None] = asyncio.Queue() + self._output_queue: asyncio.Queue[tuple[bytes | int | None, int]] = ( + asyncio.Queue() + ) + + # Background tasks + self._connection_task: asyncio.Task | None = None + self._channel_tasks: list[asyncio.Task] = [] + self._connect_failures = 0 + + # TTFB tracking + self._sent_ts: datetime | None = None + self._ttfb_sent: bool = False - # Build WebSocket URL with query parameters - self.ws_url = self._build_ws_url() + self._ws_url = self._build_ws_url() def _build_ws_url(self) -> str: - """Build the WebSocket URL with query parameters""" base = self.config.base_url - params = f"model={self.config.model}&encoding={self.config.encoding}&sample_rate={self.config.sample_rate}" + params = ( + f"model={self.config.model}" + f"&encoding={self.config.encoding}" + f"&sample_rate={self.config.sample_rate}" + ) return f"{base}?{params}" + # ── Lifecycle ──────────────────────────────────────────────── + async def start(self) -> None: - """Preheating: establish websocket connection during initialization""" - try: - await self._connect() + """Start client: connect and launch send/receive loops.""" + self._closing = False + self._connection_task = asyncio.create_task(self._connection_loop()) + # Wait briefly for connection to establish + await asyncio.sleep(0.1) + + async def stop(self) -> None: + """Stop client: 
close connection and cancel tasks.""" + self._closing = True + self._is_cancelled = True - except Exception as e: - self.ten_env.log_error(f"Deepgram TTS preheat failed: {e}") + # Signal send loop to exit + await self._text_queue.put(_SEND_STOP) + + # Cancel channel tasks + for task in self._channel_tasks: + task.cancel() + self._channel_tasks.clear() + + if self._connection_task: + self._connection_task.cancel() + try: + await self._connection_task + except asyncio.CancelledError: + pass + self._connection_task = None + + # Signal any consumer waiting on output_queue + await self._output_queue.put((None, EVENT_TTS_END)) + + if self._ws: + try: + await self._ws.send(json.dumps({"type": "Close"})) + except Exception: + pass + try: + await self._ws.close() + except Exception: + pass + self._ws = None + + async def cancel(self) -> None: + """Cancel current TTS request.""" + self.ten_env.log_debug("Cancelling current TTS task.") + self._is_cancelled = True + self.reset_ttfb() + # Send Flush to Deepgram to stop audio generation + if self._ws: + try: + await self._ws.send(json.dumps({"type": "Flush"})) + except Exception: + pass + + def reset_ttfb(self) -> None: + self._sent_ts = None + self._ttfb_sent = False + + # ── Public interface for extension ─────────────────────────── + + async def send_text(self, text: str) -> None: + """Queue text for sending to Deepgram.""" + await self._text_queue.put(text) + + async def get( + self, text: str + ) -> AsyncIterator[tuple[bytes | int | None, int]]: + """Send text and yield audio events. + + For empty text, immediately yields EVENT_TTS_END. + Otherwise sends text to the send loop and reads + events from the output queue until END or ERROR. 
+ """ + if len(text.strip()) == 0: + self.ten_env.log_warn("DeepgramTTS: empty text, returning END") + yield None, EVENT_TTS_END + return + + self._is_cancelled = False + + # Track TTFB from when we send + if not self._ttfb_sent: + self._sent_ts = datetime.now() + + # Put text into send queue + await self._text_queue.put(text) + + # Read events from output queue + while True: + try: + data_msg, event = await asyncio.wait_for( + self._output_queue.get(), timeout=5.0 + ) + except asyncio.TimeoutError: + self.ten_env.log_error("Timeout waiting for Deepgram audio") + yield ( + b"Timeout waiting for Deepgram audio", + EVENT_TTS_ERROR, + ) + break + + if event == EVENT_TTS_END: + yield None, EVENT_TTS_END + break + elif event == EVENT_TTS_ERROR: + yield data_msg, EVENT_TTS_ERROR + break + else: + yield data_msg, event + + # ── Connection loop with auto-reconnect ───────────────────── + + async def _connection_loop(self) -> None: + min_delay = 0.1 + max_delay = 3.0 + + while not self._closing: + try: + await self._connect() + self._connect_failures = 0 + + if self._closing: + return + + # Launch duplex tasks + self._channel_tasks = [ + asyncio.create_task(self._send_loop()), + asyncio.create_task(self._receive_loop()), + ] + + # Wait for either to finish + done, pending = await asyncio.wait( + self._channel_tasks, + return_when=asyncio.FIRST_COMPLETED, + ) + + for task in pending: + task.cancel() + self._channel_tasks.clear() + + for task in done: + exc = task.exception() + if exc and not isinstance(exc, asyncio.CancelledError): + self.ten_env.log_warn( + f"Channel task exception: {exc}", + category=LOG_CATEGORY_VENDOR, + ) + + except DeepgramTTSConnectionException: + raise + + except asyncio.CancelledError: + return + + except Exception as e: + self.ten_env.log_warn( + f"vendor_status: connection error: {e}", + category=LOG_CATEGORY_VENDOR, + ) + + finally: + if self._ws: + try: + await self._ws.close() + except Exception: + pass + self._ws = None + + if self._closing: + 
return + + self._connect_failures += 1 + if self._connect_failures > MAX_RETRY_TIMES: + self.ten_env.log_error( + f"Max retries ({MAX_RETRY_TIMES}) " f"exceeded", + category=LOG_CATEGORY_VENDOR, + ) + return + + delay = min( + min_delay * (2 ** (self._connect_failures - 1)), + max_delay, + ) + self.ten_env.log_debug( + f"vendor_status: reconnecting in " + f"{delay:.1f}s " + f"(attempt {self._connect_failures}" + f"/{MAX_RETRY_TIMES})", + category=LOG_CATEGORY_VENDOR, + ) + await asyncio.sleep(delay) async def _connect(self) -> None: - """Connect to the websocket""" try: extra_headers = { "Authorization": f"Token {self.config.api_key}", } - self.ws = await websockets.connect( - self.ws_url, + self._ws = await websockets.connect( + self._ws_url, additional_headers=extra_headers, ) self.ten_env.log_debug( "vendor_status: connected to deepgram tts", category=LOG_CATEGORY_VENDOR, ) - except Exception as e: error_message = str(e) if "401" in error_message or "Unauthorized" in error_message: @@ -95,204 +307,126 @@ async def _connect(self) -> None: status_code=401, body=error_message ) from e else: - self.ten_env.log_error( - f"Deepgram TTS preheat failed, unexpected error: {e}" - ) + self.ten_env.log_error(f"Deepgram TTS connection failed: {e}") if self.send_non_fatal_tts_error: await self.send_non_fatal_tts_error( error_message=error_message ) raise - async def stop(self): - # Set cancellation flag first to stop any pending operations - self._is_cancelled = True - - # Stop the websocket connection if it exists - if self.ws: - try: - # Send close message - await self.ws.send(json.dumps({"type": "Close"})) - except Exception: - pass - await self.ws.close() - self.ws = None + # ── Send loop ─────────────────────────────────────────────── - async def cancel(self): - """ - Cancel the current TTS task. 
- """ - self.ten_env.log_debug("Cancelling current TTS task.") - self._is_cancelled = True - if self.ws: - self.reset_ttfb() - # Send flush to clear any pending audio - try: - await self.ws.send(json.dumps({"type": "Flush"})) - except Exception: - pass + async def _send_loop(self) -> None: + """Read text from queue and send Speak+Flush to WS.""" + try: + while not self._closing: + text = await self._text_queue.get() + if text is _SEND_STOP: + return + + if not self._ws: + self.ten_env.log_error("WS not connected in send loop") + return + + self.ten_env.log_debug( + f"send_text: {text[:80]}", + category=LOG_CATEGORY_VENDOR, + ) - async def reconnect(self): - """Close and re-establish the websocket connection.""" - if self.ws: - try: - await self.ws.close() - except Exception: - pass - self.ws = None - await self._connect() + speak_msg = {"type": "Speak", "text": text} + await self._ws.send(json.dumps(speak_msg)) + await self._ws.send(json.dumps({"type": "Flush"})) - def reset_ttfb(self): - self.sent_ts = None - self.ttfb_sent = False + except asyncio.CancelledError: + return + except Exception as e: + self.ten_env.log_error( + f"vendor_error: send_loop error: {e}", + category=LOG_CATEGORY_VENDOR, + ) + raise - async def get( - self, text: str - ) -> AsyncIterator[tuple[bytes | int | None, int | None]]: - """Generate TTS audio for the given text""" + # ── Receive loop ──────────────────────────────────────────── - if len(text.strip()) == 0: - self.ten_env.log_warn( - "DeepgramTTS: empty text provided, " "returning END event" - ) - yield None, EVENT_TTS_END + async def _receive_loop(self) -> None: + """Read from WS and dispatch to output queue.""" + if not self._ws: return - self._is_cancelled = False try: - await self._ensure_connection() - async for audio_chunk, event_status in self._process_single_tts( - text - ): - if event_status == EVENT_TTS_FLUSH: - # Cancelled: reconnect for clean state - await self.reconnect() - break + async for message in self._ws: + if 
self._closing: + return - yield audio_chunk, event_status + if isinstance(message, bytes): + await self._handle_audio(message) + else: + await self._handle_text_message(message) + except asyncio.CancelledError: + return + except websockets.exceptions.ConnectionClosed: + self.ten_env.log_warn( + "vendor_status: WS closed by server", + category=LOG_CATEGORY_VENDOR, + ) except Exception as e: self.ten_env.log_error( - f"vendor_error: {e}", + f"vendor_error: receive_loop: {e}", category=LOG_CATEGORY_VENDOR, ) raise - async def _ensure_connection(self) -> None: - """Ensure websocket connection is established""" - if not self.ws: - await self._connect() - - async def _process_single_tts( - self, text: str - ) -> AsyncIterator[tuple[bytes | int | None, int | None]]: - """Process a single TTS request""" - if not self.ws: - self.ten_env.log_error("Deepgram websocket not connected") + async def _handle_audio(self, data: bytes) -> None: + """Handle binary audio message from WS.""" + if self._is_cancelled: + self.ten_env.log_debug("Dropping audio chunk (cancelled)") return - self.ten_env.log_debug(f"process_single_tts, text: {text}") - - if not self.ttfb_sent: - self.sent_ts = datetime.now() - - # Send the text to Deepgram - speak_msg = { - "type": "Speak", - "text": text, - } - await self.ws.send(json.dumps(speak_msg)) + # TTFB on first audio chunk + if self._sent_ts and not self._ttfb_sent: + ttfb_ms = int( + (datetime.now() - self._sent_ts).total_seconds() * 1000 + ) + await self._output_queue.put((ttfb_ms, EVENT_TTS_TTFB_METRIC)) + self._ttfb_sent = True - # Send flush to get audio immediately - await self.ws.send(json.dumps({"type": "Flush"})) + self.ten_env.log_debug( + f"DeepgramTTS: audio chunk, " f"length: {len(data)}" + ) + await self._output_queue.put((data, EVENT_TTS_RESPONSE)) + async def _handle_text_message(self, raw: str) -> None: + """Handle JSON text message from WS.""" try: - # Receive audio data - while True: - if self._is_cancelled: - 
self.ten_env.log_debug( - "Cancellation flag detected, stopping TTS stream." - ) - yield None, EVENT_TTS_FLUSH - break - - try: - message = await asyncio.wait_for( - self.ws.recv(), timeout=5.0 - ) - except asyncio.TimeoutError: - self.ten_env.log_error( - "Timeout waiting for Deepgram audio - yielding error" - ) - yield b"Timeout waiting for Deepgram audio", EVENT_TTS_ERROR - break + data = json.loads(raw) + except json.JSONDecodeError: + self.ten_env.log_warn(f"Failed to parse message: {raw}") + return - # Binary message = audio data - if isinstance(message, bytes): - # Drop audio if cancelled during recv - if self._is_cancelled: - self.ten_env.log_debug( - "Cancellation detected after recv, " - "dropping audio chunk." - ) - yield None, EVENT_TTS_FLUSH - break - - # First audio chunk, calculate TTFB - if self.sent_ts and not self.ttfb_sent: - ttfb_ms = int( - (datetime.now() - self.sent_ts).total_seconds() - * 1000 - ) - yield ttfb_ms, EVENT_TTS_TTFB_METRIC - self.ttfb_sent = True + msg_type = data.get("type", "") - self.ten_env.log_debug( - f"DeepgramTTS: sending EVENT_TTS_RESPONSE, " - f"length: {len(message)}" - ) - yield message, EVENT_TTS_RESPONSE - - # Text message = JSON metadata - else: - try: - data = json.loads(message) - msg_type = data.get("type", "") - - if msg_type == "Flushed": - # All audio for this text has been sent - self.ten_env.log_debug( - "DeepgramTTS: received Flushed, " - "sending EVENT_TTS_END" - ) - yield None, EVENT_TTS_END - break - - elif msg_type == "Warning": - self.ten_env.log_warn( - f"Deepgram warning: {data.get('warn_msg', '')}" - ) - - elif msg_type == "Error": - error_msg = data.get("err_msg", "Unknown error") - self.ten_env.log_error( - f"Deepgram error: {error_msg}" - ) - yield error_msg.encode("utf-8"), EVENT_TTS_ERROR - break - - except json.JSONDecodeError: - self.ten_env.log_warn( - f"Failed to parse Deepgram message: {message}" - ) + if msg_type == "Flushed": + self.ten_env.log_debug("DeepgramTTS: Flushed received") + 
# Always signal END so get() returns promptly + # (even after cancel — the extension checks + # cancel state separately) + await self._output_queue.put((None, EVENT_TTS_END)) - if not self._is_cancelled: - self.ten_env.log_debug("DeepgramTTS: TTS complete") + elif msg_type == "Warning": + self.ten_env.log_warn( + f"Deepgram warning: " f"{data.get('warn_msg', '')}" + ) - except Exception as e: - error_message = str(e) - self.ten_env.log_error( - f"vendor_error: {error_message}", - category=LOG_CATEGORY_VENDOR, + elif msg_type == "Error": + error_msg = data.get("err_msg", "Unknown error") + self.ten_env.log_error(f"Deepgram error: {error_msg}") + await self._output_queue.put( + ( + error_msg.encode("utf-8"), + EVENT_TTS_ERROR, + ) ) - yield error_message.encode("utf-8"), EVENT_TTS_ERROR + + else: + self.ten_env.log_debug(f"Unknown message type: {msg_type}") From 27c71be46798940e615d379077ca94388e195548 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 13:00:52 +0000 Subject: [PATCH 03/18] fix: reconnect websocket per request_id to fix interleaved requests revert from duplex pattern to clean serial model with key improvement: reconnect websocket when request_id changes. this prevents deepgram's connection from going stale after many rapid Speak+Flush cycles. cancel() now drains until Flushed before returning so the connection is clean for subsequent requests. mark_needs_reconnect() called by extension on request_id change triggers fresh connection. test_interleaved_requests now passes (was timing out on request 8/8 because deepgram stopped responding on a long-lived connection). 
--- .../extension/deepgram_tts/deepgram_tts.py | 417 ++++++------------ .../extension/deepgram_tts/extension.py | 3 + .../deepgram_tts/tests/test_basic.py | 1 - 3 files changed, 139 insertions(+), 282 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index 855f33148a..9f532bedef 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -16,18 +16,12 @@ from ten_runtime import AsyncTenEnv from ten_ai_base.const import LOG_CATEGORY_VENDOR -# Event types for the output queue +# Event types communicated back to the extension EVENT_TTS_RESPONSE = 1 EVENT_TTS_END = 2 EVENT_TTS_ERROR = 3 -EVENT_TTS_FLUSH = 4 EVENT_TTS_TTFB_METRIC = 5 -MAX_RETRY_TIMES = 5 - -# Sentinel to signal the send loop to stop -_SEND_STOP = None - class DeepgramTTSConnectionException(Exception): """Exception raised when Deepgram TTS connection fails""" @@ -41,11 +35,11 @@ def __init__(self, status_code: int, body: str): class DeepgramTTSClient: - """Duplex WebSocket client for Deepgram TTS. + """WebSocket client for Deepgram TTS. - Uses separate send and receive tasks on a single WebSocket - connection. Text goes into _text_queue via send_text(), - audio/events come out of _output_queue via get(). + Each get() call sends Speak+Flush and streams audio + until Flushed. Connection is reused across calls but + reconnected when needed (cancel, error, new request). 
""" def __init__( @@ -61,19 +55,8 @@ def __init__( self.send_non_fatal_tts_error = send_non_fatal_tts_error self._ws: ClientConnection | None = None - self._closing = False self._is_cancelled = False - - # Duplex queues - self._text_queue: asyncio.Queue[str | None] = asyncio.Queue() - self._output_queue: asyncio.Queue[tuple[bytes | int | None, int]] = ( - asyncio.Queue() - ) - - # Background tasks - self._connection_task: asyncio.Task | None = None - self._channel_tasks: list[asyncio.Task] = [] - self._connect_failures = 0 + self._needs_reconnect = False # TTFB tracking self._sent_ts: datetime | None = None @@ -90,39 +73,15 @@ def _build_ws_url(self) -> str: ) return f"{base}?{params}" - # ── Lifecycle ──────────────────────────────────────────────── - async def start(self) -> None: - """Start client: connect and launch send/receive loops.""" - self._closing = False - self._connection_task = asyncio.create_task(self._connection_loop()) - # Wait briefly for connection to establish - await asyncio.sleep(0.1) + """Preheat: establish initial connection.""" + try: + await self._connect() + except Exception as e: + self.ten_env.log_error(f"Deepgram TTS preheat failed: {e}") async def stop(self) -> None: - """Stop client: close connection and cancel tasks.""" - self._closing = True self._is_cancelled = True - - # Signal send loop to exit - await self._text_queue.put(_SEND_STOP) - - # Cancel channel tasks - for task in self._channel_tasks: - task.cancel() - self._channel_tasks.clear() - - if self._connection_task: - self._connection_task.cancel() - try: - await self._connection_task - except asyncio.CancelledError: - pass - self._connection_task = None - - # Signal any consumer waiting on output_queue - await self._output_queue.put((None, EVENT_TTS_END)) - if self._ws: try: await self._ws.send(json.dumps({"type": "Close"})) @@ -135,36 +94,50 @@ async def stop(self) -> None: self._ws = None async def cancel(self) -> None: - """Cancel current TTS request.""" + """Cancel 
current TTS. + + Sends Flush and drains until Flushed so the + connection is clean for the next request. + """ self.ten_env.log_debug("Cancelling current TTS task.") self._is_cancelled = True self.reset_ttfb() - # Send Flush to Deepgram to stop audio generation if self._ws: try: await self._ws.send(json.dumps({"type": "Flush"})) - except Exception: - pass + # Drain until Flushed to leave connection clean + await asyncio.wait_for(self._drain_until_flushed(), timeout=3.0) + except (asyncio.TimeoutError, Exception) as e: + self.ten_env.log_warn( + f"Cancel drain failed: {e}, " + "will reconnect on next request" + ) + self._needs_reconnect = True + + async def _drain_until_flushed(self) -> None: + """Read and discard WS messages until Flushed.""" + while self._ws: + msg = await self._ws.recv() + if isinstance(msg, str): + try: + data = json.loads(msg) + if data.get("type") == "Flushed": + return + except json.JSONDecodeError: + pass def reset_ttfb(self) -> None: self._sent_ts = None self._ttfb_sent = False - # ── Public interface for extension ─────────────────────────── - - async def send_text(self, text: str) -> None: - """Queue text for sending to Deepgram.""" - await self._text_queue.put(text) + def mark_needs_reconnect(self) -> None: + """Called by extension when request_id changes.""" + self._needs_reconnect = True async def get( self, text: str ) -> AsyncIterator[tuple[bytes | int | None, int]]: - """Send text and yield audio events. - - For empty text, immediately yields EVENT_TTS_END. - Otherwise sends text to the send loop and reads - events from the output queue until END or ERROR. 
- """ + """Send text and yield audio events.""" if len(text.strip()) == 0: self.ten_env.log_warn("DeepgramTTS: empty text, returning END") yield None, EVENT_TTS_END @@ -172,117 +145,102 @@ async def get( self._is_cancelled = False - # Track TTFB from when we send - if not self._ttfb_sent: - self._sent_ts = datetime.now() - - # Put text into send queue - await self._text_queue.put(text) + # Reconnect if needed (new request_id or after error) + if self._needs_reconnect: + await self._reconnect() + self._needs_reconnect = False - # Read events from output queue - while True: - try: - data_msg, event = await asyncio.wait_for( - self._output_queue.get(), timeout=5.0 - ) - except asyncio.TimeoutError: - self.ten_env.log_error("Timeout waiting for Deepgram audio") - yield ( - b"Timeout waiting for Deepgram audio", - EVENT_TTS_ERROR, - ) - break - - if event == EVENT_TTS_END: - yield None, EVENT_TTS_END - break - elif event == EVENT_TTS_ERROR: - yield data_msg, EVENT_TTS_ERROR - break - else: - yield data_msg, event - - # ── Connection loop with auto-reconnect ───────────────────── + await self._ensure_connection() - async def _connection_loop(self) -> None: - min_delay = 0.1 - max_delay = 3.0 + if not self._ttfb_sent: + self._sent_ts = datetime.now() - while not self._closing: - try: - await self._connect() - self._connect_failures = 0 - - if self._closing: - return - - # Launch duplex tasks - self._channel_tasks = [ - asyncio.create_task(self._send_loop()), - asyncio.create_task(self._receive_loop()), - ] - - # Wait for either to finish - done, pending = await asyncio.wait( - self._channel_tasks, - return_when=asyncio.FIRST_COMPLETED, - ) + # Send Speak + Flush + speak_msg = {"type": "Speak", "text": text} + await self._ws.send(json.dumps(speak_msg)) + await self._ws.send(json.dumps({"type": "Flush"})) - for task in pending: - task.cancel() - self._channel_tasks.clear() + # Receive audio until Flushed + try: + while True: + if self._is_cancelled: + 
self.ten_env.log_debug("Cancelled, stopping stream.") + break + + try: + message = await asyncio.wait_for( + self._ws.recv(), timeout=8.0 + ) + except asyncio.TimeoutError: + self.ten_env.log_error("Timeout waiting for Deepgram audio") + self._needs_reconnect = True + yield ( + b"Timeout waiting for Deepgram audio", + EVENT_TTS_ERROR, + ) + break - for task in done: - exc = task.exception() - if exc and not isinstance(exc, asyncio.CancelledError): - self.ten_env.log_warn( - f"Channel task exception: {exc}", - category=LOG_CATEGORY_VENDOR, + if isinstance(message, bytes): + if self._is_cancelled: + self.ten_env.log_debug("Dropping audio (cancelled)") + break + + # TTFB on first audio chunk + if self._sent_ts and not self._ttfb_sent: + ttfb_ms = int( + (datetime.now() - self._sent_ts).total_seconds() + * 1000 ) + yield ttfb_ms, EVENT_TTS_TTFB_METRIC + self._ttfb_sent = True - except DeepgramTTSConnectionException: - raise - - except asyncio.CancelledError: - return - - except Exception as e: - self.ten_env.log_warn( - f"vendor_status: connection error: {e}", - category=LOG_CATEGORY_VENDOR, - ) - - finally: - if self._ws: + self.ten_env.log_debug( + f"DeepgramTTS: audio chunk, " f"length: {len(message)}" + ) + yield message, EVENT_TTS_RESPONSE + else: try: - await self._ws.close() - except Exception: - pass - self._ws = None - - if self._closing: - return - - self._connect_failures += 1 - if self._connect_failures > MAX_RETRY_TIMES: - self.ten_env.log_error( - f"Max retries ({MAX_RETRY_TIMES}) " f"exceeded", - category=LOG_CATEGORY_VENDOR, - ) - return + data = json.loads(message) + msg_type = data.get("type", "") + + if msg_type == "Flushed": + self.ten_env.log_debug("DeepgramTTS: Flushed") + yield None, EVENT_TTS_END + break + + elif msg_type == "Warning": + self.ten_env.log_warn( + f"Deepgram warning: " + f"{data.get('warn_msg', '')}" + ) + + elif msg_type == "Error": + error_msg = data.get("err_msg", "Unknown error") + self.ten_env.log_error( + f"Deepgram error: 
{error_msg}" + ) + yield ( + error_msg.encode("utf-8"), + EVENT_TTS_ERROR, + ) + break + + except json.JSONDecodeError: + self.ten_env.log_warn(f"Failed to parse: {message}") + + if not self._is_cancelled: + self.ten_env.log_debug("DeepgramTTS: complete") - delay = min( - min_delay * (2 ** (self._connect_failures - 1)), - max_delay, - ) - self.ten_env.log_debug( - f"vendor_status: reconnecting in " - f"{delay:.1f}s " - f"(attempt {self._connect_failures}" - f"/{MAX_RETRY_TIMES})", + except Exception as e: + self.ten_env.log_error( + f"vendor_error: {e}", category=LOG_CATEGORY_VENDOR, ) - await asyncio.sleep(delay) + self._needs_reconnect = True + yield ( + str(e).encode("utf-8"), + EVENT_TTS_ERROR, + ) async def _connect(self) -> None: try: @@ -314,119 +272,16 @@ async def _connect(self) -> None: ) raise - # ── Send loop ─────────────────────────────────────────────── - - async def _send_loop(self) -> None: - """Read text from queue and send Speak+Flush to WS.""" - try: - while not self._closing: - text = await self._text_queue.get() - if text is _SEND_STOP: - return - - if not self._ws: - self.ten_env.log_error("WS not connected in send loop") - return - - self.ten_env.log_debug( - f"send_text: {text[:80]}", - category=LOG_CATEGORY_VENDOR, - ) - - speak_msg = {"type": "Speak", "text": text} - await self._ws.send(json.dumps(speak_msg)) - await self._ws.send(json.dumps({"type": "Flush"})) - - except asyncio.CancelledError: - return - except Exception as e: - self.ten_env.log_error( - f"vendor_error: send_loop error: {e}", - category=LOG_CATEGORY_VENDOR, - ) - raise - - # ── Receive loop ──────────────────────────────────────────── - - async def _receive_loop(self) -> None: - """Read from WS and dispatch to output queue.""" + async def _ensure_connection(self) -> None: if not self._ws: - return + await self._connect() - try: - async for message in self._ws: - if self._closing: - return - - if isinstance(message, bytes): - await self._handle_audio(message) - else: - 
await self._handle_text_message(message) - - except asyncio.CancelledError: - return - except websockets.exceptions.ConnectionClosed: - self.ten_env.log_warn( - "vendor_status: WS closed by server", - category=LOG_CATEGORY_VENDOR, - ) - except Exception as e: - self.ten_env.log_error( - f"vendor_error: receive_loop: {e}", - category=LOG_CATEGORY_VENDOR, - ) - raise - - async def _handle_audio(self, data: bytes) -> None: - """Handle binary audio message from WS.""" - if self._is_cancelled: - self.ten_env.log_debug("Dropping audio chunk (cancelled)") - return - - # TTFB on first audio chunk - if self._sent_ts and not self._ttfb_sent: - ttfb_ms = int( - (datetime.now() - self._sent_ts).total_seconds() * 1000 - ) - await self._output_queue.put((ttfb_ms, EVENT_TTS_TTFB_METRIC)) - self._ttfb_sent = True - - self.ten_env.log_debug( - f"DeepgramTTS: audio chunk, " f"length: {len(data)}" - ) - await self._output_queue.put((data, EVENT_TTS_RESPONSE)) - - async def _handle_text_message(self, raw: str) -> None: - """Handle JSON text message from WS.""" - try: - data = json.loads(raw) - except json.JSONDecodeError: - self.ten_env.log_warn(f"Failed to parse message: {raw}") - return - - msg_type = data.get("type", "") - - if msg_type == "Flushed": - self.ten_env.log_debug("DeepgramTTS: Flushed received") - # Always signal END so get() returns promptly - # (even after cancel — the extension checks - # cancel state separately) - await self._output_queue.put((None, EVENT_TTS_END)) - - elif msg_type == "Warning": - self.ten_env.log_warn( - f"Deepgram warning: " f"{data.get('warn_msg', '')}" - ) - - elif msg_type == "Error": - error_msg = data.get("err_msg", "Unknown error") - self.ten_env.log_error(f"Deepgram error: {error_msg}") - await self._output_queue.put( - ( - error_msg.encode("utf-8"), - EVENT_TTS_ERROR, - ) - ) - - else: - self.ten_env.log_debug(f"Unknown message type: {msg_type}") + async def _reconnect(self) -> None: + """Close and re-establish the connection.""" + if 
self._ws: + try: + await self._ws.close() + except Exception: + pass + self._ws = None + await self._connect() diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index 749f70f4eb..eee5286fdc 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -220,6 +220,9 @@ async def request_tts(self, t: TTSTextInput) -> None: ) if self.client: self.client.reset_ttfb() + if self.current_request_id is not None: + # Fresh connection for new request_id + self.client.mark_needs_reconnect() self.current_request_id = t.request_id self.current_request_finished = False self.total_audio_bytes = 0 diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py index d47e898a16..e612e4f7e8 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py @@ -29,7 +29,6 @@ from deepgram_tts.deepgram_tts import ( EVENT_TTS_RESPONSE, EVENT_TTS_END, - EVENT_TTS_FLUSH, EVENT_TTS_TTFB_METRIC, ) From cefa2f306d72d270d2936a4f268fad0a8ecc76ad Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 13:14:41 +0000 Subject: [PATCH 04/18] fix: remove reconnect-per-request, rely on cancel drain instead reconnecting on every request_id change caused test_append_input_stress to timeout (100 requests = 100 reconnections). the cancel() drain is sufficient: it waits for Flushed before returning, keeping the connection clean for the next request. reconnect only on error/timeout. both test_interleaved_requests and test_append_input_stress now pass. 
--- .../agents/ten_packages/extension/deepgram_tts/extension.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index eee5286fdc..749f70f4eb 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -220,9 +220,6 @@ async def request_tts(self, t: TTSTextInput) -> None: ) if self.client: self.client.reset_ttfb() - if self.current_request_id is not None: - # Fresh connection for new request_id - self.client.mark_needs_reconnect() self.current_request_id = t.request_id self.current_request_finished = False self.total_audio_bytes = 0 From 261949a188590a64958826313d22cda9bde47b84 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 13:30:50 +0000 Subject: [PATCH 05/18] refactor: align progressive disclosure docs with PD standard L0 repo card: - remove descriptive blockquote - remove L2 section (L2 is reached via L1 links) - rename Type to Repo Type, use enum value distributed-system - add Description row to identity block - rename L1 Summaries to L1 Index with Audience column - update Last Reviewed to 2026-04-07 07_gotchas.md: - cut from 236 to 117 lines (under 200-line ceiling) - remove full restart recipe and operational runbook material - keep actual gotchas: property tuples, signal handlers, task run, zombies, .env, next.js lock, tman wipe, graph cache, port 3000 - add pointer to new L2 deep dive new L2 deep dive operations_restarts.md: - full restart procedure with next-server kill - zombie worker cleanup - stale lock and cache cleanup - port 3000 conflict debugging with /proc forensics - .env and container restart recovery - docker cp extension code workflow - after-container-restart checklist cross-links: - add operations_restarts to deep_dives/_index.md - add to related deep dives in 07_gotchas.md and 05_workflows.md - trim 
05_workflows.md docker cp section to pointer --- docs/ai/L0_repo_card.md | 33 ++-- docs/ai/L1/05_workflows.md | 21 +-- docs/ai/L1/07_gotchas.md | 182 ++++--------------- docs/ai/L1/deep_dives/_index.md | 1 + docs/ai/L1/deep_dives/operations_restarts.md | 171 +++++++++++++++++ 5 files changed, 221 insertions(+), 187 deletions(-) create mode 100644 docs/ai/L1/deep_dives/operations_restarts.md diff --git a/docs/ai/L0_repo_card.md b/docs/ai/L0_repo_card.md index 53c8a56953..288312322b 100644 --- a/docs/ai/L0_repo_card.md +++ b/docs/ai/L0_repo_card.md @@ -1,31 +1,26 @@ # TEN Framework — Repo Card -> Open-source platform for building real-time multimodal AI agents with voice, video, and tool capabilities. - ## Identity | Field | Value | | ------------- | -------------------------------------------------------------------- | | Repo | `TEN-framework/TEN-Agent` | -| Type | `framework` (SDK-library + API-service + frontend) | +| Description | Open-source platform for building real-time multimodal AI agents | +| Repo Type | `distributed-system` | | Language | Python (extensions), Go (API server), TypeScript/React (playground) | | Deploy Target | Docker container (`ten_agent_dev`), Taskfile-based build | | Owner | TEN Framework team | -| Last Reviewed | 2026-04-02 | - -## L1 — Summaries - -| File | Purpose | -| ---------------------------------------- | -------------------------------------------------------- | -| [01_setup](L1/01_setup.md) | Docker, .env, ports, health checks, restart procedures | -| [02_architecture](L1/02_architecture.md) | Extensions, graphs, connections, RTC-first design | -| [03_code_map](L1/03_code_map.md) | Directory tree, key files, base classes, 93+ extensions | -| [04_conventions](L1/04_conventions.md) | Naming, Pydantic configs, params pattern, formatting | -| [05_workflows](L1/05_workflows.md) | Create extension, modify graph, test, restart, deploy | -| [06_interfaces](L1/06_interfaces.md) | REST API, connection schemas, base class abstract 
methods| -| [07_gotchas](L1/07_gotchas.md) | Property tuples, signal handlers, zombies, .env timing | -| [08_security](L1/08_security.md) | API keys, .env, sensitive logging, git hooks | +| Last Reviewed | 2026-04-07 | -## L2 — Deep Dives +## L1 Index -See [L1/deep_dives/_index.md](L1/deep_dives/_index.md) for extended guides referenced by L1 files. +| File | Purpose | Audience | +| ---------------------------------------- | -------------------------------------------------------- | -------- | +| [01_setup](L1/01_setup.md) | Docker, .env, ports, health checks, restart procedures | both | +| [02_architecture](L1/02_architecture.md) | Extensions, graphs, connections, RTC-first design | both | +| [03_code_map](L1/03_code_map.md) | Directory tree, key files, base classes, 93+ extensions | both | +| [04_conventions](L1/04_conventions.md) | Naming, Pydantic configs, params pattern, formatting | both | +| [05_workflows](L1/05_workflows.md) | Create extension, modify graph, test, restart, deploy | both | +| [06_interfaces](L1/06_interfaces.md) | REST API, connection schemas, base class abstract methods| both | +| [07_gotchas](L1/07_gotchas.md) | Property tuples, signal handlers, zombies, .env timing | both | +| [08_security](L1/08_security.md) | API keys, .env, sensitive logging, git hooks | both | diff --git a/docs/ai/L1/05_workflows.md b/docs/ai/L1/05_workflows.md index b8ad2b729b..acdc4b0b76 100644 --- a/docs/ai/L1/05_workflows.md +++ b/docs/ai/L1/05_workflows.md @@ -143,24 +143,8 @@ docker exec ten_agent_dev bash -c \ ## Update Extension Code in Running Container -When iterating on extension code locally: - -```bash -# Copy updated files into the container (use /. to avoid nested dirs) -sudo docker cp ./agents/ten_packages/extension/my_ext/. 
\ - ten_agent_dev:/app/agents/ten_packages/extension/my_ext/ - -# Verify symlink exists in the example's tenapp -sudo docker exec ten_agent_dev bash -c \ - "ls -la /app/agents/examples//tenapp/ten_packages/extension/my_ext" - -# If missing, create it manually -sudo docker exec ten_agent_dev bash -c \ - "ln -sf /app/agents/ten_packages/extension/my_ext \ - /app/agents/examples//tenapp/ten_packages/extension/my_ext" - -# Then nuclear restart -``` +See [Operations and Restarts](deep_dives/operations_restarts.md) for the full procedure +including `docker cp` syntax, symlink verification, and restart steps. ## Pre-Commit Checks @@ -179,3 +163,4 @@ Pre-commit hooks validate: API key patterns, Black formatting, conventional comm - [Extension Development](deep_dives/extension_development.md) — Full extension creation with code examples - [Graph Configuration](deep_dives/graph_configuration.md) — Connection wiring and routing patterns - [Testing](deep_dives/testing.md) — Test infrastructure, guarder tests, debugging +- [Operations and Restarts](deep_dives/operations_restarts.md) — Full restart procedures, recovery diff --git a/docs/ai/L1/07_gotchas.md b/docs/ai/L1/07_gotchas.md index e61011e489..15251d5f57 100644 --- a/docs/ai/L1/07_gotchas.md +++ b/docs/ai/L1/07_gotchas.md @@ -41,110 +41,42 @@ Never start the server with `./bin/api` or `./bin/main` directly. ## Zombie Worker Processes -Worker processes (`bin/main`) run on the **host machine**, not inside Docker. -They survive container restarts and server restarts. - -```bash -# Check for zombies -ps -elf | grep 'bin/main' | grep -v grep - -# Kill them -ps -elf | grep 'bin/main' | grep -v grep | awk '{print $4}' | xargs -r sudo kill -9 -``` - -Always kill zombies before restarting the server. +Worker processes (`bin/main`) can survive container and server restarts. +Always check for and kill zombies before restarting. ## .env Loaded at Container Startup Only -Editing `.env` while the container is running has **no effect**. 
You must: - -```bash -cd /home/ubuntu/ten-framework/ai_agents -docker compose down && docker compose up -d -# Then reinstall Python deps and task run -``` - -## Node.js Version for Playground - -Playground requires Node.js >= 20.9.0. The host machine may have an older version. -Always run playground from **inside the container** (has Node 22): - -```bash -# WRONG: running from host with Node 18 -cd playground && npm run dev # Fails - -# CORRECT: task run starts playground inside container automatically -``` +Editing `.env` while the container is running has **no effect**. You must +`docker compose down && docker compose up -d`, then reinstall Python deps. ## Next.js Lock File -After crashes, `.next/dev/lock` becomes stale, preventing restart: - -```bash -sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" -``` - -Always use nuclear restart after playground crashes. +After crashes, `.next/dev/lock` becomes stale, preventing restart. Delete it +and do a full restart. See [Operations and Restarts](deep_dives/operations_restarts.md). ## Python Deps Not Persisted -Python dependencies are installed into the container's filesystem and are lost -on container restart. Always reinstall after `docker compose down && up`: - -```bash -docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant-advanced/tenapp && bash scripts/install_python_deps.sh" -``` +Python dependencies are lost on container restart. Always reinstall after +`docker compose down && up`. -## tman Install Creates Symlinks - -Never manually create symlinks with `ln -s` for extensions. -Always use `tman install` which resolves dependencies and creates correct links: +## tman install Can Wipe bin/main -```bash -docker exec ten_agent_dev bash -c "cd /app/agents/examples//tenapp && tman install" -``` +Running `tman install` when system dependencies have newer versions replaces +the runtime packages and **deletes `bin/main`**. 
Use `task install` (full +rebuild) instead of bare `tman install`. Signs: Worker fails with +`bin/main: No such file or directory` in logs. -**Important:** If `tman install` doesn't create a symlink for a new extension (e.g., after -adding it to `manifest.json`), create it manually as a fallback: +## tman Install Creates Symlinks -```bash -sudo docker exec ten_agent_dev bash -c \ - "ln -sf /app/agents/ten_packages/extension/my_ext \ - /app/agents/examples//tenapp/ten_packages/extension/my_ext" -``` +Never manually `ln -s` for extensions. Use `tman install` which resolves +dependencies and creates correct links. If a symlink is missing after +`tman install`, create it manually as a fallback. ## docker cp Creates Nested Directories -When using `docker cp` to update extension code in the container, beware of -trailing slashes creating nested directories: - -```bash -# WRONG — creates /app/.../deepgram_tts/deepgram_tts/ (nested) -sudo docker cp ./deepgram_tts/ container:/app/.../deepgram_tts/ - -# CORRECT — copy contents into existing directory -sudo docker cp ./deepgram_tts/. container:/app/.../deepgram_tts/ -``` - -If you see `ModuleNotFoundError: No module named 'ten_packages.extension.X'` -after a `docker cp`, check for nested directories inside the extension folder. - -## tman install Can Wipe bin/main - -Running `tman install` when system dependencies have newer versions will replace -the runtime packages, which **deletes `bin/main`**. You must run the full -`task install` (not just `tman install`) to rebuild it: - -```bash -# This alone can break things if runtime versions changed: -docker exec ten_agent_dev bash -c "cd /app/.../tenapp && tman install" - -# This is safe — rebuilds bin/main after tman install: -docker exec ten_agent_dev bash -c "cd /app/agents/examples/ && task install" -``` - -Signs: Worker fails with `bin/main: No such file or directory` in logs. +When using `docker cp` to update extension code, trailing slashes create +nested directories. 
Use `docker cp ./ext/. container:/path/ext/` syntax. +Signs: `ModuleNotFoundError: No module named 'ten_packages.extension.X'`. ## Audio Routing: Split at Source Only @@ -152,84 +84,34 @@ When routing audio to multiple destinations, the split must happen at the source node (e.g., `agora_rtc`), not at intermediate nodes. Splitting from intermediate nodes can cause crashes. -```json -// CORRECT: agora_rtc sends pcm_frame to both stt AND vad -{"extension": "agora_rtc", "audio_frame": [ - {"name": "pcm_frame", "dest": [{"extension": "stt"}, {"extension": "vad"}]} -]} -``` - ## Frontend Caches Graph List -The playground caches the `/graphs` API response. When adding or removing graphs -from `property.json`, a nuclear restart is required — simple server restart -is not enough. +The playground caches the `/graphs` API response. When adding or removing +graphs from `property.json`, a full restart is required — simple server +restart is not enough. ## Manifest Module Name Must Match The `name` field in extension `manifest.json` must exactly match the `addon` field used in graph nodes in `property.json`. Mismatches cause silent failures. +## next-server Holds Port 3000 + +Killing `node` and `bun` is not enough — `next-server` is a separate process +that holds port 3000. If port 3000 is occupied, Next.js silently starts on +3001+ which isn't Docker-exposed, making the frontend appear down. + ## Apple Silicon Docker Docker containers may need Rosetta for x86 images on Apple Silicon Macs. -Enable in Docker Desktop: Settings → General → Use Rosetta for x86_64/amd64 emulation. +Enable in Docker Desktop: Settings > General > Use Rosetta. ## Windows Line Endings -Before cloning on Windows, configure git to preserve Unix line endings: - -```bash -git config --global core.autocrlf false -``` - -## Nuclear Restart Recipe - -When in doubt, use the nuclear option. **Must kill `next-server` too** — it -holds port 3000 even after its parent `node` process is killed: - -```bash -# 1. 
Kill EVERYTHING (including next-server which holds port 3000) -sudo docker exec ten_agent_dev bash -c \ - "pkill -9 -f 'bin/api'; pkill -9 -f bun; pkill -9 -f node; pkill -9 -f next-server; pkill -9 -f tman" - -# 2. Clean up stale files -sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" - -# 3. Wait for port 3000 TIME_WAIT to clear (critical!) -# If Next.js can't bind port 3000, it silently starts on 3001/3002 which -# isn't exposed by Docker — the frontend appears down. -sleep 30 # or check: docker exec ten_agent_dev bash -c "cat /proc/net/tcp6 | grep ':0BB8' | wc -l" - -# 4. Start -sudo docker exec -d ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant && task run > /tmp/task_run.log 2>&1" - -# 5. Verify (wait ~12s for startup) -sleep 12 -sudo docker exec ten_agent_dev bash -c \ - "curl -s http://localhost:8080/health && curl -s -o /dev/null -w ' Frontend:%{http_code}' http://localhost:3000/" -``` - -**Verify the logs** — check Next.js started on port 3000 (not 3001+): -```bash -sudo docker exec ten_agent_dev bash -c "strings /tmp/task_run.log | grep -E 'Local:|Port|Ready|Error'" -``` - -If you see `Port 3000 is in use`, find and kill the process holding it: -```bash -sudo docker exec ten_agent_dev bash -c \ - "for pid in /proc/[0-9]*/fd/*; do \ - link=\$(readlink \$pid 2>/dev/null); \ - echo \"\$link\" | grep -q socket: && \ - inode=\$(echo \$link | grep -oP '\\d+') && \ - grep -q \$inode /proc/net/tcp6 2>/dev/null && \ - grep \$inode /proc/net/tcp6 | grep -q ':0BB8' && \ - echo PID=\$(echo \$pid | cut -d/ -f3) && break; \ - done" -``` +Before cloning on Windows: `git config --global core.autocrlf false` ## Related Deep Dives +- [Operations and Restarts](deep_dives/operations_restarts.md) — Full restart procedures, port debugging, recovery - [Deployment](deep_dives/deployment.md) — Production setup, persistent startup - [Server Architecture](deep_dives/server_architecture.md) — Worker lifecycle, session management diff 
--git a/docs/ai/L1/deep_dives/_index.md b/docs/ai/L1/deep_dives/_index.md index 34502c601f..07b5b13b20 100644 --- a/docs/ai/L1/deep_dives/_index.md +++ b/docs/ai/L1/deep_dives/_index.md @@ -7,3 +7,4 @@ | [testing.md](testing.md) | All 15 TTS + 10 ASR guarder tests, pass criteria, config files, debugging | Running or debugging tests for an extension | | [deployment.md](deployment.md) | Docker, Cloudflare, Nginx, Grafana monitoring | Deploying to production or setting up monitoring | | [server_architecture.md](server_architecture.md) | Go server, property injection, worker lifecycle | Understanding server internals or debugging | +| [operations_restarts.md](operations_restarts.md) | Full restart procedures, port debugging, recovery| Restarting services, crash recovery, port conflicts| diff --git a/docs/ai/L1/deep_dives/operations_restarts.md b/docs/ai/L1/deep_dives/operations_restarts.md new file mode 100644 index 0000000000..cf824560ed --- /dev/null +++ b/docs/ai/L1/deep_dives/operations_restarts.md @@ -0,0 +1,171 @@ +# Operations and Restarts + +> **When to Read This:** Load this document when you need to restart services, +> debug port conflicts, recover from crashes, or clean up zombie processes. + +## When to Do a Full Restart + +| What Changed | Action | +| ------------------------------- | ---------------------------------------------------- | +| `property.json` (graphs added) | Full restart (frontend caches graph list) | +| `property.json` (config only) | No restart needed (loaded per session) | +| `.env` | `docker compose down && docker compose up -d` + deps | +| Python code | Restart server only | +| Go code | `task install` then restart server | +| Container restart | Reinstall Python deps, then `task run` | + +## Full Restart Procedure + +Must kill `next-server` too — it holds port 3000 even after `node`/`bun` die: + +```bash +# 1. 
Kill EVERYTHING
+sudo docker exec ten_agent_dev bash -c \
+  "pkill -9 -f 'bin/api'; pkill -9 -f bun; pkill -9 -f node; \
+   pkill -9 -f next-server; pkill -9 -f tman"
+
+# 2. Clean up stale files
+sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock"
+
+# 3. Wait for port 3000 TIME_WAIT to clear
+# If Next.js can't bind port 3000, it silently starts on 3001/3002
+# which isn't exposed by Docker — the frontend appears down.
+sleep 30
+
+# 4. Start
+sudo docker exec -d ten_agent_dev bash -c \
+  "cd /app/agents/examples/<example> && task run > /tmp/task_run.log 2>&1"
+
+# 5. Verify (wait ~12s for startup)
+sleep 12
+sudo docker exec ten_agent_dev bash -c \
+  "curl -s http://localhost:8080/health && \
+   curl -s -o /dev/null -w ' Frontend:%{http_code}' http://localhost:3000/"
+```
+
+## Verification
+
+Check Next.js started on port 3000 (not 3001+):
+
+```bash
+sudo docker exec ten_agent_dev bash -c \
+  "strings /tmp/task_run.log | grep -E 'Local:|Port|Ready|Error'"
+```
+
+Expected output:
+```
+ - Local: http://localhost:3000
+ Ready in 2.1s
+```
+
+If you see `Port 3000 is in use`, the frontend is on the wrong port.
+
+## Zombie Worker Cleanup
+
+Worker processes (`bin/main`) can survive container and server restarts:
+
+```bash
+# Check for zombies
+sudo docker exec ten_agent_dev bash -c \
+  "ps aux | grep 'bin/main' | grep -v grep"
+
+# Kill them
+sudo docker exec ten_agent_dev bash -c \
+  "pkill -9 -f 'bin/main'"
+```
+
+Always kill zombies before restarting the server.
+
+## Stale Lock Cleanup
+
+After crashes, `.next/dev/lock` becomes stale:
+
+```bash
+sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock"
+```
+
+Also clear the Next.js cache if React version errors appear:
+
+```bash
+sudo docker exec ten_agent_dev bash -c "rm -rf /app/playground/.next"
+```
+
+## Port 3000 Conflict Debugging
+
+If Next.js reports "Port 3000 is in use", find the process holding it:
+
+```bash
+sudo docker exec ten_agent_dev bash -c \
+  "for pid in /proc/[0-9]*/fd/*; do \
+     link=\$(readlink \$pid 2>/dev/null); \
+     echo \"\$link\" | grep -q socket: && \
+     inode=\$(echo \$link | grep -oP '\\d+') && \
+     grep -q \$inode /proc/net/tcp6 2>/dev/null && \
+     grep \$inode /proc/net/tcp6 | grep -q ':0BB8' && \
+     echo PID=\$(echo \$pid | cut -d/ -f3) && break; \
+   done"
+```
+
+Kill the PID, wait for TIME_WAIT to clear (~30s), then restart.
+
+If no PID is found but port is still busy, it's in TIME_WAIT state. Check:
+
+```bash
+sudo docker exec ten_agent_dev bash -c \
+  "cat /proc/net/tcp6 | grep ':0BB8'"
+```
+
+State `06` = TIME_WAIT. Wait 30-60 seconds for it to clear.
+
+## .env and Container Restart Recovery
+
+`.env` is loaded at container startup only. After editing:
+
+```bash
+cd /home/ubuntu/ten-framework/ai_agents
+docker compose down && docker compose up -d
+```
+
+Then reinstall everything (Python deps are not persisted):
+
+```bash
+sudo docker exec ten_agent_dev bash -c \
+  "cd /app/agents/examples/<example> && task install"
+```
+
+## Copying Extension Code to Running Container
+
+When iterating on extension code locally:
+
+```bash
+# Copy updated files (use /. to avoid nested dirs)
+sudo docker cp ./agents/ten_packages/extension/my_ext/.
\
+  ten_agent_dev:/app/agents/ten_packages/extension/my_ext/
+
+# Verify symlink exists in the example's tenapp
+sudo docker exec ten_agent_dev bash -c \
+  "ls -la /app/agents/examples/<example>/tenapp/ten_packages/extension/my_ext"
+
+# If missing, create it manually
+sudo docker exec ten_agent_dev bash -c \
+  "ln -sf /app/agents/ten_packages/extension/my_ext \
+   /app/agents/examples/<example>/tenapp/ten_packages/extension/my_ext"
+```
+
+Then do a full restart.
+
+## After Container Restart Checklist
+
+1. Reinstall Python dependencies
+2. Rebuild Go binary (`task install`)
+3. Kill any zombie workers
+4. Remove stale `.next/dev/lock`
+5. Start with `task run`
+6. Verify health endpoint and frontend status code
+
+## See Also
+
+- [Back to Gotchas](../07_gotchas.md)
+- [Back to Workflows](../05_workflows.md)
+- [Deployment](deployment.md) — Production setup, Cloudflare, Nginx
+- [Server Architecture](server_architecture.md) — Worker lifecycle

From 2fd1ace327e044d4b990231808a3b326acfa1725 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 7 Apr 2026 13:45:34 +0000
Subject: [PATCH 06/18] =?UTF-8?q?fix:=20address=20codex=20review=20?=
 =?UTF-8?q?=E2=80=94=20connect=20fail-fast=20and=20error=20handling?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

issue 1 (high): _connect() now always raises after calling the error
callback on 401. previously it returned control to the caller with
self._ws == None, causing a secondary AttributeError that masked the
real auth failure.

issue 2 (high): EVENT_TTS_ERROR on non-final chunks is logged as a
warning but not sent as a data event. sending error data for transient
partial-stream failures confuses the test harness and the base class
state machine. errors are only surfaced via _finalize_request() on the
final chunk (text_input_end=True), which is the correct contract.

open question: request state fields (current_request_id, sent_ts,
_audio_start_sent) are shared mutable state.
however, the base class AsyncTTS2BaseExtension serializes request_tts() calls — it does not overlap them. this is confirmed by the interleaved_requests test passing, which exercises rapid request_id switching. --- .../extension/deepgram_tts/deepgram_tts.py | 9 +++--- .../extension/deepgram_tts/extension.py | 31 ++++++++++++------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index 9f532bedef..ab774ae3b1 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -260,10 +260,11 @@ async def _connect(self) -> None: if "401" in error_message or "Unauthorized" in error_message: if self.send_fatal_tts_error: await self.send_fatal_tts_error(error_message=error_message) - else: - raise DeepgramTTSConnectionException( - status_code=401, body=error_message - ) from e + # Always raise so callers don't proceed + # with self._ws == None + raise DeepgramTTSConnectionException( + status_code=401, body=error_message + ) from e else: self.ten_env.log_error(f"Deepgram TTS connection failed: {e}") if self.send_non_fatal_tts_error: diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index 749f70f4eb..7f18db2b72 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -350,25 +350,34 @@ async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: break elif event_status == EVENT_TTS_ERROR: - self.ten_env.log_error( - "Received TTS_ERROR event from " "Deepgram TTS" - ) error_msg = ( data_msg.decode("utf-8") if isinstance(data_msg, bytes) else str(data_msg) ) + self.ten_env.log_error(f"TTS_ERROR from Deepgram: {error_msg}") + error = 
ModuleError( + message=error_msg, + module=ModuleType.TTS, + code=ModuleErrorCode.NON_FATAL_ERROR, + vendor_info=ModuleErrorVendorInfo(vendor=self.vendor()), + ) if t.text_input_end: + # Final chunk: surface error and + # finalize the request await self._finalize_request( TTSAudioEndReason.ERROR, - error=ModuleError( - message=error_msg, - module=ModuleType.TTS, - code=(ModuleErrorCode.NON_FATAL_ERROR), - vendor_info=ModuleErrorVendorInfo( - vendor=self.vendor() - ), - ), + error=error, + ) + else: + # Non-final chunk: log only. The base + # class will send subsequent chunks for + # this request_id; errors on partial + # streaming are transient. + self.ten_env.log_warn( + f"Transient TTS error on non-final " + f"chunk for {t.request_id}: " + f"{error_msg}" ) break From ace659fe5bd604ee6766839e21fdf0cdb6a2847b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 13:50:53 +0000 Subject: [PATCH 07/18] test: add state machine, recovery, and redaction tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit address codex review gaps vs cartesia_tts test coverage: - test_sequential_requests: 3 requests with different IDs, validates request_id in audio_start and audio_end events - test_reconnect_after_error: first request errors mid-stream, second request completes successfully (recovery) - test_config_redacts_api_key: to_str(sensitive_handling=True) does not leak the API key - test_client_empty_text_yields_end: unit test on client.get() for empty text — yields END immediately, no WS connection - test_client_whitespace_text_yields_end: same for whitespace standalone tests: 18/18 passed (was 13) --- .../deepgram_tts/tests/test_state_machine.py | 289 ++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py 
b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py new file mode 100644 index 0000000000..96cdb72ad3 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py @@ -0,0 +1,289 @@ +import sys +from pathlib import Path + +# Add project root to sys.path +project_root = str(Path(__file__).resolve().parents[6]) +if project_root not in sys.path: + sys.path.insert(0, project_root) + +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# +import asyncio +import copy +import json +from unittest.mock import patch, AsyncMock, MagicMock + +from ten_runtime import ( + ExtensionTester, + TenEnvTester, + Data, +) +from ten_ai_base.struct import TTSTextInput +from deepgram_tts.deepgram_tts import ( + EVENT_TTS_RESPONSE, + EVENT_TTS_END, + EVENT_TTS_TTFB_METRIC, + EVENT_TTS_ERROR, + DeepgramTTSClient, +) +from deepgram_tts.config import DeepgramTTSConfig + +MOCK_CONFIG = { + "params": { + "api_key": "test_api_key", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000, + }, +} + + +def _create_mock_client(): + mock = MagicMock() + mock.start = AsyncMock() + mock.stop = AsyncMock() + mock.cancel = AsyncMock() + mock.reset_ttfb = lambda: None + mock.mark_needs_reconnect = lambda: None + + fake_audio = b"\x00\x01" * 200 + + async def mock_get(text): + yield (100, EVENT_TTS_TTFB_METRIC) + yield (fake_audio, EVENT_TTS_RESPONSE) + yield (None, EVENT_TTS_END) + + mock.get.side_effect = mock_get + return mock + + +# ================ test sequential requests ================ +class SequentialRequestsTester(ExtensionTester): + """Send 3 requests with different IDs sequentially. + + Each request should produce tts_audio_start, audio + frames, and tts_audio_end with the correct request_id. 
+ """ + + def __init__(self): + super().__init__() + self.completed_request_ids = [] + self.audio_start_ids = [] + self.expected_ids = [ + "seq_req_1", + "seq_req_2", + "seq_req_3", + ] + self.send_index = 0 + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + ten_env_tester.log_info("Sequential requests test started.") + self._send_next(ten_env_tester) + ten_env_tester.on_start_done() + + def _send_next(self, ten_env_tester: TenEnvTester) -> None: + if self.send_index >= len(self.expected_ids): + return + req_id = self.expected_ids[self.send_index] + tts_input = TTSTextInput( + request_id=req_id, + text=f"Hello from request {self.send_index + 1}.", + text_input_end=True, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, tts_input.model_dump_json()) + ten_env_tester.send_data(data) + self.send_index += 1 + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "tts_audio_start": + json_str, _ = data.get_property_to_json("") + d = json.loads(json_str) if json_str else {} + rid = d.get("request_id", "") + self.audio_start_ids.append(rid) + elif name == "tts_audio_end": + json_str, _ = data.get_property_to_json("") + d = json.loads(json_str) if json_str else {} + rid = d.get("request_id", "") + self.completed_request_ids.append(rid) + ten_env.log_info(f"Completed request: {rid}") + if len(self.completed_request_ids) < len(self.expected_ids): + self._send_next(ten_env) + else: + ten_env.stop_test() + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_sequential_requests(MockClient): + """Each sequential request should complete with its own + request_id in audio_start and audio_end.""" + MockClient.return_value = _create_mock_client() + + tester = SequentialRequestsTester() + tester.set_test_mode_single("deepgram_tts", json.dumps(MOCK_CONFIG)) + tester.run() + + assert tester.completed_request_ids == [ + "seq_req_1", + "seq_req_2", + "seq_req_3", + ], ( + f"Expected 3 
sequential completions, got " + f"{tester.completed_request_ids}" + ) + assert tester.audio_start_ids == [ + "seq_req_1", + "seq_req_2", + "seq_req_3", + ], f"audio_start ids mismatch: {tester.audio_start_ids}" + + +# ================ test reconnect after error ================ +class ReconnectAfterErrorTester(ExtensionTester): + """First request errors, second request should succeed. + + Validates that the client recovers after a mid-stream + failure. + """ + + def __init__(self): + super().__init__() + self.error_received = False + self.second_audio_end = False + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + # First request will trigger an error + tts_input = TTSTextInput( + request_id="err_req_1", + text="This will error.", + text_input_end=True, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, tts_input.model_dump_json()) + ten_env_tester.send_data(data) + ten_env_tester.on_start_done() + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "tts_audio_end": + if not self.error_received: + # First request ended (with error) — send + # second request + self.error_received = True + tts_input = TTSTextInput( + request_id="ok_req_2", + text="This should work.", + text_input_end=True, + ) + data2 = Data.create("tts_text_input") + data2.set_property_from_json(None, tts_input.model_dump_json()) + ten_env.send_data(data2) + else: + self.second_audio_end = True + ten_env.stop_test() + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_reconnect_after_error(MockClient): + """After an error, subsequent requests should succeed.""" + call_count = 0 + + def create_mock(): + mock = MagicMock() + mock.start = AsyncMock() + mock.stop = AsyncMock() + mock.cancel = AsyncMock() + mock.reset_ttfb = lambda: None + mock.mark_needs_reconnect = lambda: None + + fake_audio = b"\x00\x01" * 200 + + async def mock_get(text): + nonlocal call_count + call_count += 1 + if call_count == 1: + # 
First call: error + yield ( + b"Simulated error", + EVENT_TTS_ERROR, + ) + else: + # Subsequent calls: success + yield (100, EVENT_TTS_TTFB_METRIC) + yield (fake_audio, EVENT_TTS_RESPONSE) + yield (None, EVENT_TTS_END) + + mock.get.side_effect = mock_get + return mock + + MockClient.return_value = create_mock() + + tester = ReconnectAfterErrorTester() + tester.set_test_mode_single("deepgram_tts", json.dumps(MOCK_CONFIG)) + tester.run() + + assert ( + tester.second_audio_end + ), "Second request should complete after first errored." + + +# ================ test config redaction ================ +def test_config_redacts_api_key(): + """to_str(sensitive_handling=True) must not leak the + API key.""" + config = DeepgramTTSConfig( + params={ + "api_key": "super-secret-key-12345", + "model": "aura-2-thalia-en", + } + ) + config.update_params() + + safe_str = config.to_str(sensitive_handling=True) + + assert "super-secret-key-12345" not in safe_str + assert "aura-2-thalia-en" in safe_str + + +# ================ test empty text yields END ================ +def test_client_empty_text_yields_end(): + """get() with empty text should yield EVENT_TTS_END + immediately without connecting.""" + + async def _run(): + ten_env = MagicMock() + ten_env.log_warn = MagicMock() + config = DeepgramTTSConfig(api_key="test") + client = DeepgramTTSClient(config=config, ten_env=ten_env) + + events = [] + async for data, event in client.get(""): + events.append(event) + + assert events == [EVENT_TTS_END] + assert client._ws is None # no connection made + + asyncio.run(_run()) + + +def test_client_whitespace_text_yields_end(): + """get() with whitespace-only text should yield + EVENT_TTS_END.""" + + async def _run(): + ten_env = MagicMock() + ten_env.log_warn = MagicMock() + config = DeepgramTTSConfig(api_key="test") + client = DeepgramTTSClient(config=config, ten_env=ten_env) + + events = [] + async for data, event in client.get(" \n\t "): + events.append(event) + + assert events == 
[EVENT_TTS_END] + + asyncio.run(_run()) From 2a65917383b37ff3626c61de4354572bedd00056 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 14:05:56 +0000 Subject: [PATCH 08/18] fix: eliminate double error emission on auth failure, add targeted tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit remove error callbacks from DeepgramTTSClient._connect() — error reporting is now solely the caller's responsibility. this eliminates the double-report where _connect() called send_fatal_tts_error and then raised, causing _handle_connection_error to send a second error. consolidate error handlers to use _finalize_request() which emits exactly one error via finish_request(error=...). new tests: - test_auth_error_single_emission: 401 produces exactly 1 error event - test_nonfinal_error_not_surfaced: error on non-final chunk is logged but not sent as public data event (documented contract) standalone tests: 20/20 passed --- .../extension/deepgram_tts/deepgram_tts.py | 13 -- .../extension/deepgram_tts/extension.py | 31 +--- .../deepgram_tts/tests/test_state_machine.py | 150 ++++++++++++++++++ 3 files changed, 157 insertions(+), 37 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index ab774ae3b1..4896b997c9 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -5,7 +5,6 @@ # import asyncio import json -from collections.abc import Callable from datetime import datetime from typing import AsyncIterator @@ -46,13 +45,9 @@ def __init__( self, config: DeepgramTTSConfig, ten_env: AsyncTenEnv, - send_fatal_tts_error: Callable[[str], asyncio.Future] | None = None, - send_non_fatal_tts_error: Callable[[str], asyncio.Future] | None = None, ): self.config = config self.ten_env = ten_env - self.send_fatal_tts_error = 
send_fatal_tts_error - self.send_non_fatal_tts_error = send_non_fatal_tts_error self._ws: ClientConnection | None = None self._is_cancelled = False @@ -258,19 +253,11 @@ async def _connect(self) -> None: except Exception as e: error_message = str(e) if "401" in error_message or "Unauthorized" in error_message: - if self.send_fatal_tts_error: - await self.send_fatal_tts_error(error_message=error_message) - # Always raise so callers don't proceed - # with self._ws == None raise DeepgramTTSConnectionException( status_code=401, body=error_message ) from e else: self.ten_env.log_error(f"Deepgram TTS connection failed: {e}") - if self.send_non_fatal_tts_error: - await self.send_non_fatal_tts_error( - error_message=error_message - ) raise async def _ensure_connection(self) -> None: diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index 7f18db2b72..b56e8c1230 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -138,8 +138,6 @@ def _create_client(self, ten_env: AsyncTenEnv) -> DeepgramTTSClient: return DeepgramTTSClient( config=self.config, ten_env=ten_env, - send_fatal_tts_error=self.send_fatal_tts_error, - send_non_fatal_tts_error=(self.send_non_fatal_tts_error), ) async def _ensure_client(self) -> None: @@ -271,15 +269,7 @@ async def request_tts(self, t: TTSTextInput) -> None: code=ModuleErrorCode.NON_FATAL_ERROR, vendor_info=ModuleErrorVendorInfo(vendor=self.vendor()), ) - await self.send_tts_error( - request_id=self.current_request_id, - error=error, - ) - await self.finish_request( - request_id=self.current_request_id, - reason=TTSAudioEndReason.ERROR, - error=error, - ) + await self._finalize_request(TTSAudioEndReason.ERROR, error=error) if isinstance(e, ConnectionRefusedError): await self._reconnect_client() @@ -388,10 +378,11 @@ async def _process_tts_text(self, text: 
str, t: TTSTextInput) -> None: async def _handle_connection_error( self, e: DeepgramTTSConnectionException ) -> None: - """Handle Deepgram connection errors.""" - self.ten_env.log_error( - f"DeepgramTTSConnectionException in request_tts: " f"{e.body}" - ) + """Handle Deepgram connection errors. + + Sends exactly one error event via _finalize_request. + """ + self.ten_env.log_error(f"DeepgramTTSConnectionException: {e.body}") if e.status_code == 401: code = ModuleErrorCode.FATAL_ERROR else: @@ -407,15 +398,7 @@ async def _handle_connection_error( message=e.body, ), ) - await self.send_tts_error( - request_id=self.current_request_id, - error=error, - ) - await self.finish_request( - request_id=self.current_request_id, - reason=TTSAudioEndReason.ERROR, - error=error, - ) + await self._finalize_request(TTSAudioEndReason.ERROR, error=error) def _setup_recorder(self, request_id: str) -> None: """Set up PCMWriter for a new request.""" diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py index 96cdb72ad3..a6307bc1e6 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py @@ -287,3 +287,153 @@ async def _run(): assert events == [EVENT_TTS_END] asyncio.run(_run()) + + +# ================ test 401 emits exactly one error ================ +class AuthErrorTester(ExtensionTester): + """Validates that a 401 auth failure emits exactly one + error event and one terminal audio_end.""" + + def __init__(self): + super().__init__() + self.error_count = 0 + self.audio_end_count = 0 + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + tts_input = TTSTextInput( + request_id="auth_err_req", + text="This should fail with 401.", + text_input_end=True, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, 
tts_input.model_dump_json()) + ten_env_tester.send_data(data) + ten_env_tester.on_start_done() + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "error": + self.error_count += 1 + elif name == "tts_audio_end": + self.audio_end_count += 1 + ten_env.stop_test() + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_auth_error_single_emission(MockClient): + """401 should produce exactly 1 error event, not + duplicates.""" + from deepgram_tts.deepgram_tts import ( + DeepgramTTSConnectionException, + ) + + mock = MagicMock() + mock.start = AsyncMock() + mock.stop = AsyncMock() + mock.cancel = AsyncMock() + mock.reset_ttfb = lambda: None + mock.mark_needs_reconnect = lambda: None + + async def mock_get_auth_fail(text): + raise DeepgramTTSConnectionException( + status_code=401, body="Unauthorized" + ) + yield # make it a generator # pragma: no cover + + mock.get.side_effect = mock_get_auth_fail + MockClient.return_value = mock + + tester = AuthErrorTester() + tester.set_test_mode_single("deepgram_tts", json.dumps(MOCK_CONFIG)) + tester.run() + + assert tester.error_count == 1, ( + f"Expected exactly 1 error event, got " f"{tester.error_count}" + ) + + +# ================ test non-final error contract ================ +class NonFinalErrorTester(ExtensionTester): + """Validates that an error on a non-final chunk does NOT + produce a public error event. 
Partial stream errors are + transient — only logged, not surfaced to callers.""" + + def __init__(self): + super().__init__() + self.error_count = 0 + self.audio_end_received = False + + def on_start(self, ten_env_tester: TenEnvTester) -> None: + # First chunk: non-final, will error + tts_input = TTSTextInput( + request_id="nonfinal_req", + text="First chunk errors.", + text_input_end=False, + ) + data = Data.create("tts_text_input") + data.set_property_from_json(None, tts_input.model_dump_json()) + ten_env_tester.send_data(data) + + # Second chunk: final, succeeds + tts_input2 = TTSTextInput( + request_id="nonfinal_req", + text="Second chunk works.", + text_input_end=True, + ) + data2 = Data.create("tts_text_input") + data2.set_property_from_json(None, tts_input2.model_dump_json()) + ten_env_tester.send_data(data2) + ten_env_tester.on_start_done() + + def on_data(self, ten_env: TenEnvTester, data) -> None: + name = data.get_name() + if name == "error": + self.error_count += 1 + elif name == "tts_audio_end": + self.audio_end_received = True + ten_env.stop_test() + + +@patch("deepgram_tts.extension.DeepgramTTSClient") +def test_nonfinal_error_not_surfaced(MockClient): + """Error on non-final chunk should not emit public + error event. 
This is the intended contract: partial + stream errors are transient.""" + call_count = 0 + + def create_mock(): + mock = MagicMock() + mock.start = AsyncMock() + mock.stop = AsyncMock() + mock.cancel = AsyncMock() + mock.reset_ttfb = lambda: None + mock.mark_needs_reconnect = lambda: None + + fake_audio = b"\x00\x01" * 200 + + async def mock_get(text): + nonlocal call_count + call_count += 1 + if call_count == 1: + yield (b"Transient error", EVENT_TTS_ERROR) + else: + yield (100, EVENT_TTS_TTFB_METRIC) + yield (fake_audio, EVENT_TTS_RESPONSE) + yield (None, EVENT_TTS_END) + + mock.get.side_effect = mock_get + return mock + + MockClient.return_value = create_mock() + + tester = NonFinalErrorTester() + tester.set_test_mode_single("deepgram_tts", json.dumps(MOCK_CONFIG)) + tester.run() + + assert tester.error_count == 0, ( + f"Non-final error should not produce public error " + f"event, got {tester.error_count}" + ) + assert ( + tester.audio_end_received + ), "Request should still complete after non-final error" From f197de3d6eb3b0ee7e6cf2294dd7b8cca6f98e66 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 14:36:04 +0000 Subject: [PATCH 09/18] docs: add tar sync method, cache cleanup, fix guarder test count - operations_restarts.md: add tar-based container sync that excludes __pycache__ and .pytest_cache (recommended over docker cp). add cleanup command for stale cache artifacts in container. - testing.md: fix TTS guarder count from 15 to 16. add container sync guidance before running tests. 
--- docs/ai/L1/deep_dives/operations_restarts.md | 22 +++++++++++++++++++- docs/ai/L1/deep_dives/testing.md | 16 +++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/docs/ai/L1/deep_dives/operations_restarts.md b/docs/ai/L1/deep_dives/operations_restarts.md index cf824560ed..c4bff1c10f 100644 --- a/docs/ai/L1/deep_dives/operations_restarts.md +++ b/docs/ai/L1/deep_dives/operations_restarts.md @@ -138,10 +138,17 @@ sudo docker exec ten_agent_dev bash -c \ When iterating on extension code locally: ```bash -# Copy updated files (use /. to avoid nested dirs) +# Option 1: docker cp with /. suffix (avoids nested dirs) sudo docker cp ./agents/ten_packages/extension/my_ext/. \ ten_agent_dev:/app/agents/ten_packages/extension/my_ext/ +# Option 2: tar with cache exclusion (recommended — avoids +# __pycache__ and .pytest_cache causing import errors) +tar --exclude='__pycache__' --exclude='.pytest_cache' \ + -C ai_agents/agents/ten_packages/extension/my_ext -cf - . | \ + sudo docker exec -i ten_agent_dev tar \ + -C /app/agents/ten_packages/extension/my_ext -xf - + # Verify symlink exists in the example's tenapp sudo docker exec ten_agent_dev bash -c \ "ls -la /app/agents/examples//tenapp/ten_packages/extension/my_ext" @@ -154,6 +161,19 @@ sudo docker exec ten_agent_dev bash -c \ Then do a full restart. +**Common pitfall:** If `docker cp` copies `__pycache__` or `.pytest_cache` +from your local machine into the container, it can cause `ImportError` or +stale bytecode during test collection. Use the tar method above or clean +the container directory before copying: + +```bash +sudo docker exec ten_agent_dev bash -c \ + "find /app/agents/ten_packages/extension/my_ext \ + -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null; \ + find /app/agents/ten_packages/extension/my_ext \ + -type d -name .pytest_cache -exec rm -rf {} + 2>/dev/null" +``` + ## After Container Restart Checklist 1. 
Reinstall Python dependencies diff --git a/docs/ai/L1/deep_dives/testing.md b/docs/ai/L1/deep_dives/testing.md index 1ad2d72ecd..7c77e5ce0d 100644 --- a/docs/ai/L1/deep_dives/testing.md +++ b/docs/ai/L1/deep_dives/testing.md @@ -24,16 +24,26 @@ docker exec ten_agent_dev bash -c \ docker exec ten_agent_dev bash -c \ "cd /app && task test-extension-no-install EXTENSION=agents/ten_packages/extension/deepgram_tts" -# TTS guarder (all 15 tests) +# TTS guarder (16 tests) docker exec ten_agent_dev bash -c "cd /app && task tts-guarder-test EXTENSION=deepgram_tts" -# ASR guarder (all 10 tests) +# ASR guarder (10 tests) docker exec ten_agent_dev bash -c "cd /app && task asr-guarder-test EXTENSION=azure_asr_python" -# Specific test only +# Specific test only (faster iteration on failures) docker exec ten_agent_dev bash -c "cd /app && task tts-guarder-test EXTENSION=deepgram_tts -- -k test_flush" ``` +**Before running tests**, sync your local code into the container. Use tar +to exclude cache artifacts that cause import errors: + +```bash +tar --exclude='__pycache__' --exclude='.pytest_cache' \ + -C ai_agents/agents/ten_packages/extension/my_ext -cf - . 
| \ + sudo docker exec -i ten_agent_dev tar \ + -C /app/agents/ten_packages/extension/my_ext -xf - +``` + ## Extension Standalone Tests Each extension can have `tests/` with a `bin/start` entry point: From 8a3282352db41ffd9c60c3903598d8faa71b3d2c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 15:15:01 +0000 Subject: [PATCH 10/18] =?UTF-8?q?fix:=20address=20code=20review=20?= =?UTF-8?q?=E2=80=94=20401=20detection,=20dead=20code,=20dump=20writes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - use websockets.exceptions.InvalidStatus for typed 401 detection with string-match fallback for non-websockets exceptions - remove dead send_fatal/non_fatal_tts_error methods (unused after client callback removal) - remove redundant "LOG_CATEGORY_KEY_POINT: " log prefix - await _write_dump() and _setup_recorder() directly instead of fire-and-forget asyncio.create_task (errors were silently lost) - remove unused asyncio import - remove duplicate pathlib import in test_basic.py graph connections verified: voice_assistant_deepgram_tts has the same 3 connection blocks as the working voice_assistant graph. the main_python extension handles LLM/TTS routing internally. 
--- .../extension/deepgram_tts/deepgram_tts.py | 13 ++++-- .../extension/deepgram_tts/extension.py | 43 +++++-------------- .../deepgram_tts/tests/test_basic.py | 1 - 3 files changed, 20 insertions(+), 37 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index 4896b997c9..b78c317ab5 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -10,6 +10,7 @@ import websockets from websockets.asyncio.client import ClientConnection +from websockets.exceptions import InvalidStatus from .config import DeepgramTTSConfig from ten_runtime import AsyncTenEnv @@ -250,15 +251,21 @@ async def _connect(self) -> None: "vendor_status: connected to deepgram tts", category=LOG_CATEGORY_VENDOR, ) + except InvalidStatus as e: + raise DeepgramTTSConnectionException( + status_code=e.response.status_code, + body=str(e), + ) from e except Exception as e: error_message = str(e) + # Fallback string match for non-websockets + # exceptions (e.g., mocked tests) if "401" in error_message or "Unauthorized" in error_message: raise DeepgramTTSConnectionException( status_code=401, body=error_message ) from e - else: - self.ten_env.log_error(f"Deepgram TTS connection failed: {e}") - raise + self.ten_env.log_error(f"Deepgram TTS connection failed: {e}") + raise async def _ensure_connection(self) -> None: if not self._ws: diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index b56e8c1230..b6ff1006dd 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -3,7 +3,6 @@ # Licensed under the Apache License, Version 2.0. # See the LICENSE file for more information. 
# -import asyncio from datetime import datetime import os import traceback @@ -60,8 +59,7 @@ async def on_init(self, ten_env: AsyncTenEnv) -> None: self.config = DeepgramTTSConfig.model_validate_json(config_json_str) self.config.update_params() ten_env.log_info( - f"LOG_CATEGORY_KEY_POINT: " - f"{self.config.to_str(sensitive_handling=True)}", + self.config.to_str(sensitive_handling=True), category=LOG_CATEGORY_KEY_POINT, ) @@ -226,7 +224,7 @@ async def request_tts(self, t: TTSTextInput) -> None: if t.metadata is not None: self.session_id = t.metadata.get("session_id", "") self.current_turn_id = t.metadata.get("turn_id", -1) - self._setup_recorder(t.request_id) + await self._setup_recorder(t.request_id) elif self.current_request_finished: self.ten_env.log_error( f"Received a message for a finished " @@ -301,7 +299,7 @@ async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: f"#{chunk_count}, " f"size: {len(data_msg)} bytes" ) - self._write_dump(data_msg) + await self._write_dump(data_msg) await self.send_tts_audio_data(data_msg) else: self.ten_env.log_debug( @@ -400,7 +398,7 @@ async def _handle_connection_error( ) await self._finalize_request(TTSAudioEndReason.ERROR, error=error) - def _setup_recorder(self, request_id: str) -> None: + async def _setup_recorder(self, request_id: str) -> None: """Set up PCMWriter for a new request.""" if not (self.config and self.config.dump): return @@ -409,7 +407,7 @@ def _setup_recorder(self, request_id: str) -> None: rid for rid in self.recorder_map.keys() if rid != request_id ]: try: - asyncio.create_task(self.recorder_map[old_rid].flush()) + await self.recorder_map[old_rid].flush() del self.recorder_map[old_rid] self.ten_env.log_debug( f"Cleaned up old PCMWriter for " f"request_id: {old_rid}" @@ -431,7 +429,7 @@ def _setup_recorder(self, request_id: str) -> None: f"{request_id}, file: {dump_file_path}" ) - def _write_dump(self, data: bytes) -> None: + async def _write_dump(self, data: bytes) -> None: """Write 
audio data to dump file if enabled.""" if ( self.config @@ -439,31 +437,10 @@ def _write_dump(self, data: bytes) -> None: and self.current_request_id and self.current_request_id in self.recorder_map ): - asyncio.create_task( - self.recorder_map[self.current_request_id].write(data) - ) - - async def send_fatal_tts_error(self, error_message: str) -> None: - await self.send_tts_error( - request_id=self.current_request_id or "", - error=ModuleError( - message=error_message, - module=ModuleType.TTS, - code=ModuleErrorCode.FATAL_ERROR, - vendor_info=ModuleErrorVendorInfo(vendor=self.vendor()), - ), - ) - - async def send_non_fatal_tts_error(self, error_message: str) -> None: - await self.send_tts_error( - request_id=self.current_request_id or "", - error=ModuleError( - message=error_message, - module=ModuleType.TTS, - code=ModuleErrorCode.NON_FATAL_ERROR, - vendor_info=ModuleErrorVendorInfo(vendor=self.vendor()), - ), - ) + try: + await self.recorder_map[self.current_request_id].write(data) + except Exception as e: + self.ten_env.log_error(f"Dump write failed: {e}") def _current_request_interval_ms(self) -> int: if not self.sent_ts: diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py index e612e4f7e8..230f65f9ce 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py @@ -12,7 +12,6 @@ # Licensed under the Apache License, Version 2.0. # See the LICENSE file for more information. 
# -from pathlib import Path import json from unittest.mock import patch, AsyncMock import os From 319d50422e8a1041d7dd63dc85af33b7fdf31deb Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 15:39:08 +0000 Subject: [PATCH 11/18] fix: resolve pylint W1404 implicit string concatenation warnings --- .../ten_packages/extension/deepgram_tts/extension.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index b6ff1006dd..ad527ac716 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -120,9 +120,7 @@ async def cancel_tts(self) -> None: if self.sent_ts: await self._finalize_request(TTSAudioEndReason.INTERRUPTED) else: - self.ten_env.log_warn( - "No current request found, " "skipping TTS cancellation." - ) + self.ten_env.log_warn("No current request, skipping cancel.") def vendor(self) -> str: return "deepgram" @@ -303,7 +301,7 @@ async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: await self.send_tts_audio_data(data_msg) else: self.ten_env.log_debug( - "Received empty payload for " "TTS response" + "Received empty payload for TTS response" ) if t.text_input_end: await self._finalize_request( @@ -331,7 +329,7 @@ async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: elif event_status == EVENT_TTS_END: self.ten_env.log_info( - "Received TTS_END event from " "Deepgram TTS" + "Received TTS_END event from Deepgram TTS" ) if t.text_input_end: await self._finalize_request(TTSAudioEndReason.REQUEST_END) From ede8cff978b0a7e2ec14c072ab776663ba8cc274 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 15:52:31 +0000 Subject: [PATCH 12/18] fix: reconnect on server errors, break after finalize, cleanup - set _needs_reconnect on Deepgram server-side Error messages, not just Python 
exceptions. a protocol-level error leaves the websocket in an unknown state. - add break after _finalize_request() in empty-payload branch to stop processing after request is finalized. - remove dead mark_needs_reconnect() method and test mock refs. - replace inline 8.0 timeout with WS_RECV_TIMEOUT constant. --- .../extension/deepgram_tts/deepgram_tts.py | 10 +++++----- .../ten_packages/extension/deepgram_tts/extension.py | 1 + .../extension/deepgram_tts/tests/test_state_machine.py | 4 ---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index b78c317ab5..4dd87b5362 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -22,6 +22,9 @@ EVENT_TTS_ERROR = 3 EVENT_TTS_TTFB_METRIC = 5 +# Seconds to wait for a WebSocket response before timeout +WS_RECV_TIMEOUT = 8.0 + class DeepgramTTSConnectionException(Exception): """Exception raised when Deepgram TTS connection fails""" @@ -126,10 +129,6 @@ def reset_ttfb(self) -> None: self._sent_ts = None self._ttfb_sent = False - def mark_needs_reconnect(self) -> None: - """Called by extension when request_id changes.""" - self._needs_reconnect = True - async def get( self, text: str ) -> AsyncIterator[tuple[bytes | int | None, int]]: @@ -165,7 +164,7 @@ async def get( try: message = await asyncio.wait_for( - self._ws.recv(), timeout=8.0 + self._ws.recv(), timeout=WS_RECV_TIMEOUT ) except asyncio.TimeoutError: self.ten_env.log_error("Timeout waiting for Deepgram audio") @@ -215,6 +214,7 @@ async def get( self.ten_env.log_error( f"Deepgram error: {error_msg}" ) + self._needs_reconnect = True yield ( error_msg.encode("utf-8"), EVENT_TTS_ERROR, diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py 
b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index ad527ac716..1c7ff2417a 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -307,6 +307,7 @@ async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: await self._finalize_request( TTSAudioEndReason.REQUEST_END ) + break elif event_status == EVENT_TTS_TTFB_METRIC: if data_msg is not None and isinstance(data_msg, int): diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py index a6307bc1e6..b78b3b43aa 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py @@ -47,7 +47,6 @@ def _create_mock_client(): mock.stop = AsyncMock() mock.cancel = AsyncMock() mock.reset_ttfb = lambda: None - mock.mark_needs_reconnect = lambda: None fake_audio = b"\x00\x01" * 200 @@ -198,7 +197,6 @@ def create_mock(): mock.stop = AsyncMock() mock.cancel = AsyncMock() mock.reset_ttfb = lambda: None - mock.mark_needs_reconnect = lambda: None fake_audio = b"\x00\x01" * 200 @@ -332,7 +330,6 @@ def test_auth_error_single_emission(MockClient): mock.stop = AsyncMock() mock.cancel = AsyncMock() mock.reset_ttfb = lambda: None - mock.mark_needs_reconnect = lambda: None async def mock_get_auth_fail(text): raise DeepgramTTSConnectionException( @@ -407,7 +404,6 @@ def create_mock(): mock.stop = AsyncMock() mock.cancel = AsyncMock() mock.reset_ttfb = lambda: None - mock.mark_needs_reconnect = lambda: None fake_audio = b"\x00\x01" * 200 From 35e9b110b6767ab152a3d1c1423ad4ce57fea1ff Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 16:07:42 +0000 Subject: [PATCH 13/18] fix: cancel finalization, exception cleanup, test bootstrap - cancel_tts() now always calls 
_finalize_request() when current_request_id is set, regardless of sent_ts. prevents downstream consumers hanging when cancel arrives before first text is processed. - simplify redundant except (asyncio.TimeoutError, Exception) to except Exception. - move sys.path bootstrap to conftest.py, remove from all 6 test files. license headers now appear first as per repo style. - remove unused import (copy) from test_state_machine.py. --- .../extension/deepgram_tts/deepgram_tts.py | 2 +- .../extension/deepgram_tts/extension.py | 6 +-- .../extension/deepgram_tts/tests/conftest.py | 8 +++ .../deepgram_tts/tests/test_basic.py | 9 ---- .../deepgram_tts/tests/test_error_msg.py | 8 --- .../deepgram_tts/tests/test_metrics.py | 8 --- .../deepgram_tts/tests/test_params.py | 54 ++++++++----------- .../deepgram_tts/tests/test_robustness.py | 32 ++++------- .../deepgram_tts/tests/test_state_machine.py | 17 ++---- 9 files changed, 48 insertions(+), 96 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index 4dd87b5362..060e726b3c 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -106,7 +106,7 @@ async def cancel(self) -> None: await self._ws.send(json.dumps({"type": "Flush"})) # Drain until Flushed to leave connection clean await asyncio.wait_for(self._drain_until_flushed(), timeout=3.0) - except (asyncio.TimeoutError, Exception) as e: + except Exception as e: self.ten_env.log_warn( f"Cancel drain failed: {e}, " "will reconnect on next request" diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index 1c7ff2417a..20129d6f9e 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py 
@@ -112,13 +112,11 @@ async def cancel_tts(self) -> None: self.current_request_finished = True if self.current_request_id: self.ten_env.log_debug( - f"Current request {self.current_request_id} " - f"is being cancelled. Sending INTERRUPTED." + f"Cancelling request {self.current_request_id}" ) if self.client: await self.client.cancel() - if self.sent_ts: - await self._finalize_request(TTSAudioEndReason.INTERRUPTED) + await self._finalize_request(TTSAudioEndReason.INTERRUPTED) else: self.ten_env.log_warn("No current request, skipping cancel.") diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/conftest.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/conftest.py index 001977148c..958647c64d 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/conftest.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/conftest.py @@ -1,3 +1,11 @@ +import sys +from pathlib import Path + +# Add project root to sys.path for test imports +project_root = str(Path(__file__).resolve().parents[6]) +if project_root not in sys.path: + sys.path.insert(0, project_root) + # # This file is part of TEN Framework, an open source project. # Licensed under the Apache License, Version 2.0. diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py index 230f65f9ce..2f001d17f3 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_basic.py @@ -1,12 +1,3 @@ -import sys -from pathlib import Path - -# Add project root to sys.path to allow running tests from this directory -# The project root is 6 levels up from the parent directory of this file. -project_root = str(Path(__file__).resolve().parents[6]) -if project_root not in sys.path: - sys.path.insert(0, project_root) - # # This file is part of TEN Framework, an open source project. 
# Licensed under the Apache License, Version 2.0. diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_error_msg.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_error_msg.py index 26e5cccf70..f194ca34cc 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_error_msg.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_error_msg.py @@ -1,11 +1,3 @@ -import sys -from pathlib import Path - -# Add project root to sys.path -project_root = str(Path(__file__).resolve().parents[6]) -if project_root not in sys.path: - sys.path.insert(0, project_root) - # # This file is part of TEN Framework, an open source project. # Licensed under the Apache License, Version 2.0. diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_metrics.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_metrics.py index 3705c130f4..60d7cdfe20 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_metrics.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_metrics.py @@ -1,11 +1,3 @@ -import sys -from pathlib import Path - -# Add project root to sys.path -project_root = str(Path(__file__).resolve().parents[6]) -if project_root not in sys.path: - sys.path.insert(0, project_root) - # # This file is part of TEN Framework, an open source project. # Licensed under the Apache License, Version 2.0. 
diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py index d597cd6a52..aded961fde 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py @@ -1,11 +1,3 @@ -import sys -from pathlib import Path - -# Add project root to sys.path -project_root = str(Path(__file__).resolve().parents[6]) -if project_root not in sys.path: - sys.path.insert(0, project_root) - # # This file is part of TEN Framework, an open source project. # Licensed under the Apache License, Version 2.0. @@ -14,6 +6,7 @@ import json from unittest.mock import patch, AsyncMock + from ten_runtime import ( ExtensionTester, TenEnvTester, @@ -25,6 +18,24 @@ EVENT_TTS_END, EVENT_TTS_TTFB_METRIC, ) +from unittest.mock import MagicMock + + +def create_mock_client(): + mock = MagicMock() + mock.start = AsyncMock() + mock.stop = AsyncMock() + mock.cancel = AsyncMock() + mock.reset_ttfb = lambda: None + fake_audio = b"\x00\x01\x02\x03" * 100 + + async def mock_get(text): + yield (100, EVENT_TTS_TTFB_METRIC) + yield (fake_audio, EVENT_TTS_RESPONSE) + yield (None, EVENT_TTS_END) + + mock.get.side_effect = mock_get + return mock # ================ test different sample rates ================ @@ -58,31 +69,10 @@ def on_audio_frame(self, ten_env: TenEnvTester, audio_frame): self.audio_chunks_count += 1 -def _create_mock_client(): - """Helper to create a mock client for tests.""" - from unittest.mock import MagicMock - - mock = MagicMock() - mock.start = AsyncMock() - mock.stop = AsyncMock() - mock.cancel = AsyncMock() - mock.reset_ttfb = lambda: None - - fake_audio_chunk = b"\x00\x01\x02\x03" * 100 - - async def mock_get_audio_stream(text: str): - yield (100, EVENT_TTS_TTFB_METRIC) - yield (fake_audio_chunk, EVENT_TTS_RESPONSE) - yield (None, EVENT_TTS_END) - - mock.get.side_effect = mock_get_audio_stream 
- return mock - - @patch("deepgram_tts.extension.DeepgramTTSClient") def test_sample_rate_16000(MockDeepgramTTSClient): """Test with 16000 Hz sample rate.""" - MockDeepgramTTSClient.return_value = _create_mock_client() + MockDeepgramTTSClient.return_value = create_mock_client() tester = ExtensionTesterSampleRate(16000) tester.set_test_mode_single( @@ -108,7 +98,7 @@ def test_sample_rate_16000(MockDeepgramTTSClient): @patch("deepgram_tts.extension.DeepgramTTSClient") def test_sample_rate_24000(MockDeepgramTTSClient): """Test with 24000 Hz sample rate.""" - MockDeepgramTTSClient.return_value = _create_mock_client() + MockDeepgramTTSClient.return_value = create_mock_client() tester = ExtensionTesterSampleRate(24000) tester.set_test_mode_single( @@ -134,7 +124,7 @@ def test_sample_rate_24000(MockDeepgramTTSClient): @patch("deepgram_tts.extension.DeepgramTTSClient") def test_sample_rate_48000(MockDeepgramTTSClient): """Test with 48000 Hz sample rate.""" - MockDeepgramTTSClient.return_value = _create_mock_client() + MockDeepgramTTSClient.return_value = create_mock_client() tester = ExtensionTesterSampleRate(48000) tester.set_test_mode_single( diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_robustness.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_robustness.py index b807fe5834..6191c8f14a 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_robustness.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_robustness.py @@ -1,11 +1,3 @@ -import sys -from pathlib import Path - -# Add project root to sys.path -project_root = str(Path(__file__).resolve().parents[6]) -if project_root not in sys.path: - sys.path.insert(0, project_root) - # # This file is part of TEN Framework, an open source project. # Licensed under the Apache License, Version 2.0. 
@@ -14,6 +6,7 @@ import json from unittest.mock import patch, AsyncMock + from ten_runtime import ( ExtensionTester, TenEnvTester, @@ -25,26 +18,23 @@ EVENT_TTS_END, EVENT_TTS_TTFB_METRIC, ) +from unittest.mock import MagicMock -def _create_mock_client(): - """Helper to create a mock client for tests.""" - from unittest.mock import MagicMock - +def create_mock_client(): mock = MagicMock() mock.start = AsyncMock() mock.stop = AsyncMock() mock.cancel = AsyncMock() mock.reset_ttfb = lambda: None + fake_audio = b"\x00\x01\x02\x03" * 100 - fake_audio_chunk = b"\x00\x01\x02\x03" * 100 - - async def mock_get_audio_stream(text: str): + async def mock_get(text): yield (100, EVENT_TTS_TTFB_METRIC) - yield (fake_audio_chunk, EVENT_TTS_RESPONSE) + yield (fake_audio, EVENT_TTS_RESPONSE) yield (None, EVENT_TTS_END) - mock.get.side_effect = mock_get_audio_stream + mock.get.side_effect = mock_get return mock @@ -78,7 +68,7 @@ def on_data(self, ten_env: TenEnvTester, data) -> None: @patch("deepgram_tts.extension.DeepgramTTSClient") def test_empty_text(MockDeepgramTTSClient): """Test that empty text is handled gracefully.""" - MockDeepgramTTSClient.return_value = _create_mock_client() + MockDeepgramTTSClient.return_value = create_mock_client() tester = ExtensionTesterEmptyText() tester.set_test_mode_single( @@ -132,7 +122,7 @@ def on_data(self, ten_env: TenEnvTester, data) -> None: @patch("deepgram_tts.extension.DeepgramTTSClient") def test_whitespace_text(MockDeepgramTTSClient): """Test that whitespace-only text is handled gracefully.""" - MockDeepgramTTSClient.return_value = _create_mock_client() + MockDeepgramTTSClient.return_value = create_mock_client() tester = ExtensionTesterWhitespaceText() tester.set_test_mode_single( @@ -192,7 +182,7 @@ def on_audio_frame(self, ten_env: TenEnvTester, audio_frame): @patch("deepgram_tts.extension.DeepgramTTSClient") def test_long_text(MockDeepgramTTSClient): """Test that long text is handled correctly.""" - MockDeepgramTTSClient.return_value 
= _create_mock_client() + MockDeepgramTTSClient.return_value = create_mock_client() tester = ExtensionTesterLongText() tester.set_test_mode_single( @@ -252,7 +242,7 @@ def on_data(self, ten_env: TenEnvTester, data) -> None: @patch("deepgram_tts.extension.DeepgramTTSClient") def test_special_characters(MockDeepgramTTSClient): """Test that special characters are handled correctly.""" - MockDeepgramTTSClient.return_value = _create_mock_client() + MockDeepgramTTSClient.return_value = create_mock_client() tester = ExtensionTesterSpecialChars() tester.set_test_mode_single( diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py index b78b3b43aa..12650f9d2c 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_state_machine.py @@ -1,21 +1,13 @@ -import sys -from pathlib import Path - -# Add project root to sys.path -project_root = str(Path(__file__).resolve().parents[6]) -if project_root not in sys.path: - sys.path.insert(0, project_root) - # # This file is part of TEN Framework, an open source project. # Licensed under the Apache License, Version 2.0. # See the LICENSE file for more information. 
# import asyncio -import copy import json from unittest.mock import patch, AsyncMock, MagicMock + from ten_runtime import ( ExtensionTester, TenEnvTester, @@ -41,14 +33,13 @@ } -def _create_mock_client(): +def create_mock_client(): mock = MagicMock() mock.start = AsyncMock() mock.stop = AsyncMock() mock.cancel = AsyncMock() mock.reset_ttfb = lambda: None - - fake_audio = b"\x00\x01" * 200 + fake_audio = b"\x00\x01\x02\x03" * 100 async def mock_get(text): yield (100, EVENT_TTS_TTFB_METRIC) @@ -120,7 +111,7 @@ def on_data(self, ten_env: TenEnvTester, data) -> None: def test_sequential_requests(MockClient): """Each sequential request should complete with its own request_id in audio_start and audio_end.""" - MockClient.return_value = _create_mock_client() + MockClient.return_value = create_mock_client() tester = SequentialRequestsTester() tester.set_test_mode_single("deepgram_tts", json.dumps(MOCK_CONFIG)) From cc9fbad07b0fa9418e62f2df080496b335334e2b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 16:20:28 +0000 Subject: [PATCH 14/18] chore: remove progressive disclosure docs from deepgram tts PR scope moved to separate PR #2132 (docs/progressive-disclosure branch). these repo-wide AI documentation files are a cross-cutting concern independent of the deepgram tts extension. 
--- AGENTS.md | 26 - CLAUDE.md | 1 - docs/ai/L0_repo_card.md | 26 - docs/ai/L1/01_setup.md | 118 ---- docs/ai/L1/02_architecture.md | 142 ---- docs/ai/L1/03_code_map.md | 117 ---- docs/ai/L1/04_conventions.md | 138 ---- docs/ai/L1/05_workflows.md | 166 ----- docs/ai/L1/06_interfaces.md | 150 ---- docs/ai/L1/07_gotchas.md | 117 ---- docs/ai/L1/08_security.md | 88 --- docs/ai/L1/deep_dives/_index.md | 10 - docs/ai/L1/deep_dives/deployment.md | 206 ------ .../ai/L1/deep_dives/extension_development.md | 653 ------------------ docs/ai/L1/deep_dives/graph_configuration.md | 410 ----------- docs/ai/L1/deep_dives/operations_restarts.md | 191 ----- docs/ai/L1/deep_dives/server_architecture.md | 211 ------ docs/ai/L1/deep_dives/testing.md | 305 -------- 18 files changed, 3075 deletions(-) delete mode 100644 AGENTS.md delete mode 100644 CLAUDE.md delete mode 100644 docs/ai/L0_repo_card.md delete mode 100644 docs/ai/L1/01_setup.md delete mode 100644 docs/ai/L1/02_architecture.md delete mode 100644 docs/ai/L1/03_code_map.md delete mode 100644 docs/ai/L1/04_conventions.md delete mode 100644 docs/ai/L1/05_workflows.md delete mode 100644 docs/ai/L1/06_interfaces.md delete mode 100644 docs/ai/L1/07_gotchas.md delete mode 100644 docs/ai/L1/08_security.md delete mode 100644 docs/ai/L1/deep_dives/_index.md delete mode 100644 docs/ai/L1/deep_dives/deployment.md delete mode 100644 docs/ai/L1/deep_dives/extension_development.md delete mode 100644 docs/ai/L1/deep_dives/graph_configuration.md delete mode 100644 docs/ai/L1/deep_dives/operations_restarts.md delete mode 100644 docs/ai/L1/deep_dives/server_architecture.md delete mode 100644 docs/ai/L1/deep_dives/testing.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index d23c0aa719..0000000000 --- a/AGENTS.md +++ /dev/null @@ -1,26 +0,0 @@ -# AI Agent Instructions - -This repository uses progressive disclosure documentation to help AI coding -agents work efficiently. 
Documentation is structured in three levels under -`docs/ai/`. - -## How to Load - -1. Read [docs/ai/L0_repo_card.md](docs/ai/L0_repo_card.md) to identify the repo. -2. Load ALL 8 files in `docs/ai/L1/`. They are small — load all of them upfront. - This gives you setup, architecture, code map, conventions, workflows, - interfaces, gotchas, and security. -3. If a task needs more detail than L1 provides, follow links to L2 deep dives - in `docs/ai/L1/deep_dives/`. Load only the specific L2 file you need. - -## Levels - -- **L0 (Repo Card):** Identity and L1 index. Table of contents. -- **L1 (Summaries):** Eight structured summaries. Load all at session start. -- **L2 (Deep Dives):** Full specifications. Load only when L1 isn't detailed enough. - -## Working Areas - -- **AI Agents development**: `ai_agents/` — see `ai_agents/AGENTS.md` for workspace-specific context -- **Core framework**: `core/`, `packages/`, `build/` -- **Operational reference**: `ai/AI_working_with_ten.md` (full), `ai/AI_working_with_ten_compact.md` (quick) diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index c2c4fb4158..0000000000 --- a/CLAUDE.md +++ /dev/null @@ -1 +0,0 @@ -Read @AGENTS.md for AI agent instructions and progressive disclosure docs. 
diff --git a/docs/ai/L0_repo_card.md b/docs/ai/L0_repo_card.md deleted file mode 100644 index 288312322b..0000000000 --- a/docs/ai/L0_repo_card.md +++ /dev/null @@ -1,26 +0,0 @@ -# TEN Framework — Repo Card - -## Identity - -| Field | Value | -| ------------- | -------------------------------------------------------------------- | -| Repo | `TEN-framework/TEN-Agent` | -| Description | Open-source platform for building real-time multimodal AI agents | -| Repo Type | `distributed-system` | -| Language | Python (extensions), Go (API server), TypeScript/React (playground) | -| Deploy Target | Docker container (`ten_agent_dev`), Taskfile-based build | -| Owner | TEN Framework team | -| Last Reviewed | 2026-04-07 | - -## L1 Index - -| File | Purpose | Audience | -| ---------------------------------------- | -------------------------------------------------------- | -------- | -| [01_setup](L1/01_setup.md) | Docker, .env, ports, health checks, restart procedures | both | -| [02_architecture](L1/02_architecture.md) | Extensions, graphs, connections, RTC-first design | both | -| [03_code_map](L1/03_code_map.md) | Directory tree, key files, base classes, 93+ extensions | both | -| [04_conventions](L1/04_conventions.md) | Naming, Pydantic configs, params pattern, formatting | both | -| [05_workflows](L1/05_workflows.md) | Create extension, modify graph, test, restart, deploy | both | -| [06_interfaces](L1/06_interfaces.md) | REST API, connection schemas, base class abstract methods| both | -| [07_gotchas](L1/07_gotchas.md) | Property tuples, signal handlers, zombies, .env timing | both | -| [08_security](L1/08_security.md) | API keys, .env, sensitive logging, git hooks | both | diff --git a/docs/ai/L1/01_setup.md b/docs/ai/L1/01_setup.md deleted file mode 100644 index c6003da202..0000000000 --- a/docs/ai/L1/01_setup.md +++ /dev/null @@ -1,118 +0,0 @@ -# 01 Setup - -> Environment setup, local development, and quick commands for TEN Framework AI Agents. 
- -## Prerequisites - -| Requirement | Version / Notes | -| ----------------- | ------------------------------------------------------------ | -| Docker + Compose | Required for container-based development | -| Node.js | LTS v18+ on host; container has Node 22 | -| API Keys | Agora App ID, OpenAI, Deepgram ASR, ElevenLabs TTS (minimum)| -| Hardware | 2+ CPU cores, 4 GB RAM minimum | - -## Docker Container - -```bash -cd /home/ubuntu/ten-framework/ai_agents -docker compose up -d -docker ps | grep ten_agent_dev # Verify running -``` - -Container image: `ghcr.io/ten-framework/ten_agent_build:0.7.14` - -## Environment Variables - -**Single .env file**: `ai_agents/.env` — the ONLY source of environment config. - -| Variable | Purpose | Required | -| ---------------------------- | ---------------------------- | -------- | -| `AGORA_APP_ID` | Agora RTC app identifier | Yes | -| `AGORA_APP_CERTIFICATE` | Agora RTC certificate | No | -| `OPENAI_API_KEY` | LLM provider | Yes | -| `OPENAI_MODEL` | Model name (e.g., `gpt-4o`) | Yes | -| `DEEPGRAM_API_KEY` | ASR provider | Yes | -| `ELEVENLABS_TTS_KEY` | TTS provider | Yes | -| `LOG_STDOUT` | Worker log visibility | Yes (`true`) | -| `SERVER_PORT` | API server port | Yes (`8080`) | -| `WORKERS_MAX` | Max concurrent sessions | Yes (`100`) | -| `WORKER_QUIT_TIMEOUT_SECONDS`| Worker idle timeout | Yes (`60`) | - -See `.env.example` for the complete list. Extensions may require additional keys -(Azure, AWS, Rime, etc.) — check extension README files. - -## Install and Run - -```bash -# 1. Install Python dependencies (NOT persisted across container restarts) -docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant-advanced/tenapp && \ - bash scripts/install_python_deps.sh" - -# 2. Build and install (5-8 minutes first time) -docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant-advanced && task install" - -# 3. 
Start everything (API server + playground + TMAN Designer) -docker exec -d ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant-advanced && \ - task run > /tmp/task_run.log 2>&1" -``` - -**CRITICAL**: Always use `task run` to start — never run `./bin/api` directly. - -## Ports - -| Port | Service | -| ----- | ---------------- | -| 8080 | Go API server | -| 3000 | Playground (Next.js) | -| 49483 | TMAN Designer | - -## Health Checks - -```bash -curl -s http://localhost:8080/health -# {"code":"0","data":null,"msg":"ok"} - -curl -s http://localhost:8080/graphs | jq -r '.data[].name' -# voice_assistant, voice_assistant_heygen, etc. -``` - -## Restart Procedures - -| What Changed | Container? | Server? | Frontend? | -| ------------------------------- | ---------- | ----------------- | ----------------- | -| `property.json` (graphs added) | No | Nuclear restart | Nuclear restart | -| `property.json` (config only) | No | No | No | -| `.env` file | Yes | Yes | No | -| Python extension code | No | Yes | No | -| Go server code | No | Yes + `task install` | No | - -**Nuclear restart** (safest after graph changes): - -```bash -sudo docker exec ten_agent_dev bash -c "pkill -9 -f 'bin/api'; pkill -9 node; pkill -9 bun" -sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" -sleep 2 -sudo docker exec -d ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant-advanced && task run > /tmp/task_run.log 2>&1" -``` - -**After container restart**: always reinstall Python deps, then `task run`. - -**After .env changes**: `docker compose down && docker compose up -d`, reinstall deps, `task run`. 
- -## Logs - -```bash -# All logs (inside container) -docker exec ten_agent_dev tail -f /tmp/task_run.log - -# Filter by extension or channel -docker exec ten_agent_dev tail -f /tmp/task_run.log | grep --line-buffered "deepgram" -``` - -## Related Deep Dives - -- [Deployment](deep_dives/deployment.md) — Docker Compose, Cloudflare tunnel, Nginx, Grafana monitoring diff --git a/docs/ai/L1/02_architecture.md b/docs/ai/L1/02_architecture.md deleted file mode 100644 index c35f537d7e..0000000000 --- a/docs/ai/L1/02_architecture.md +++ /dev/null @@ -1,142 +0,0 @@ -# 02 Architecture - -> System design overview: extensions, graphs, connections, and the server-worker model. - -## TEN Ecosystem - -| Component | Purpose | -| --------------------- | ------------------------------------------------------ | -| TEN Framework | Core runtime (C/C++, Go, Python, Node.js bindings) | -| TEN Agent Examples | Pre-built agent configurations (this repo's `ai_agents/`) | -| TEN VAD | Low-latency voice activity detection | -| TEN Turn Detection | Full-duplex dialogue management | -| TEN Portal | Documentation and blog site | - -## Extension System - -Extensions are modular components that process data — ASR, TTS, LLM, tools, RTC, avatars, etc. 
-Each extension has a lifecycle: - -``` -on_init() → on_start() → [process messages] → on_stop() → on_deinit() -``` - -Every extension contains: - -| File | Purpose | -| ----------------- | ------------------------------------------ | -| `addon.py` | Registration via `@register_addon_as_extension` | -| `extension.py` | Main logic, inherits from a base class | -| `manifest.json` | Metadata, dependencies, API interface | -| `property.json` | Default configuration values | - -**Base classes** (in `ten_ai_base/interface/ten_ai_base/`): - -| Base Class | Use For | -| ----------------------------- | ----------------- | -| `AsyncASRBaseExtension` | Speech-to-text | -| `AsyncTTS2BaseExtension` | Text-to-speech | -| `AsyncLLMBaseExtension` | Chat completion | -| `AsyncLLMToolBaseExtension` | LLM function tools| -| `AsyncExtension` | Generic / custom | - -## Graph-Based Configuration - -Agents are assembled by defining **graphs** in `property.json`. A graph specifies -which extensions run (nodes) and how data flows between them (connections). - -```json -{ - "predefined_graphs": [{ - "name": "voice_assistant", - "auto_start": true, - "graph": { - "nodes": [ - {"type": "extension", "name": "stt", "addon": "deepgram_asr_python", "property": {}}, - {"type": "extension", "name": "llm", "addon": "openai_llm2_python", "property": {}}, - {"type": "extension", "name": "tts", "addon": "elevenlabs_tts2_python", "property": {}} - ], - "connections": [...] 
- } - }] -} -``` - -## Connection Types - -| Type | Payload | Example | -| ------------- | -------------------- | -------------------------------------------- | -| `cmd` | Named commands | `tool_register`, `on_user_joined`, `flush` | -| `data` | Named data messages | `asr_result`, `text_data`, `tts_text_input` | -| `audio_frame` | PCM audio streams | `pcm_frame` (16-bit, mono, 16/24/48 kHz) | -| `video_frame` | Video streams | Raw video frames for vision/avatar | - -## RTC-First Design - -TEN uses Agora RTC (Real-Time Communication) as the default transport, not WebSockets. - -| Aspect | RTC (default) | WebSocket | -| ---------------- | -------------------------------- | -------------------------- | -| Latency | 50-150ms (UDP-based) | Higher (TCP-based) | -| Codec support | Opus, VP8, VP9, AV1 | Raw PCM only | -| Bandwidth adapt | Built-in adaptation + FEC | Manual implementation | -| Use case | Real-time voice/video | Signaling, configuration | - -WebSockets are used for signaling and configuration; RTC handles the media path. 
- -## Server-Worker Model - -``` -┌─────────────────┐ ┌──────────────────┐ -│ Go HTTP Server │────▶│ Worker Process │ (one per session) -│ (port 8080) │ │ (tman run start) │ -│ │ │ │ -│ /start → spawn │ │ Loads graph from │ -│ /stop → kill │ │ property.json │ -│ /ping → keep │ │ Runs extensions │ -└─────────────────┘ └──────────────────┘ -``` - -- **POST /start** spawns a worker process for a channel/session -- **POST /stop** terminates the worker -- **POST /ping** keeps the session alive (if timeout != -1) - -## Property Injection - -When `/start` is called, the server auto-injects dynamic values into the graph: - -- `channel_name` → injected into every node that has a `"channel"` property -- `remote_stream_id`, `bot_stream_id`, `token` → injected via `startPropMap` -- `req.Properties[extensionName]` → merged into specific node properties - -This is future-proof: any new extension with a "channel" property automatically -receives the dynamic channel value without code changes. - -## Component Diagram - -``` - Client (Browser/Mobile) - │ - ▼ - ┌──────────────┐ - │ Playground │ Next.js frontend (port 3000) - │ (UI) │ - └──────┬───────┘ - │ REST API - ▼ - ┌──────────────┐ ┌──────────────────────────────────┐ - │ Go Server │──spawn─▶│ Worker Process │ - │ (port 8080) │ │ ┌─────┐ ┌─────┐ ┌─────┐ │ - │ │ │ │ ASR │─▶│ LLM │─▶│ TTS │ │ - │ │ │ └──┬──┘ └─────┘ └──┬──┘ │ - └──────────────┘ │ │ │ │ - │ ┌──┴───────────────────┴──┐ │ - │ │ Agora RTC │ │ - │ └─────────────────────────┘ │ - └──────────────────────────────────┘ -``` - -## Related Deep Dives - -- [Server Architecture](deep_dives/server_architecture.md) — Go server internals, property injection pipeline -- [Graph Configuration](deep_dives/graph_configuration.md) — Node schema, connection wiring, parallel routing diff --git a/docs/ai/L1/03_code_map.md b/docs/ai/L1/03_code_map.md deleted file mode 100644 index dd60723726..0000000000 --- a/docs/ai/L1/03_code_map.md +++ /dev/null @@ -1,117 +0,0 @@ -# 03 Code Map - -> 
Directory tree, module responsibilities, and key file locations. - -## Top-Level Structure - -All AI agent development happens inside `ai_agents/`: - -``` -ai_agents/ -├── agents/ -│ ├── ten_packages/ -│ │ ├── extension/ # 93+ extensions (ASR, TTS, LLM, tools, avatar) -│ │ └── system/ # Core runtime packages -│ │ ├── ten_ai_base/ # Base classes and API interface definitions -│ │ ├── ten_runtime_python/ -│ │ └── ten_runtime_go/ -│ ├── examples/ # 24+ example agent configurations -│ │ ├── voice-assistant/ -│ │ ├── voice-assistant-advanced/ -│ │ ├── voice-assistant-realtime/ -│ │ ├── voice-assistant-video/ -│ │ ├── doodler/ -│ │ └── ... -│ ├── integration_tests/ # Test frameworks -│ │ ├── asr_guarder/ # ASR integration tests -│ │ └── tts_guarder/ # TTS integration tests -│ └── scripts/ # Build and packaging scripts -├── server/ # Go API server -│ ├── main.go -│ └── internal/ -│ ├── http_server.go # REST endpoints, property injection -│ └── config.go # Parameter mapping (startPropMap) -├── playground/ # Next.js frontend UI (port 3000) -│ └── src/ # React components -├── esp32-client/ # ESP32 hardware client -├── Taskfile.yml # Root-level build/test tasks -├── docker-compose.yml # Container config -├── .env # Environment variables (single source) -└── .env.example # Template with all variables -``` - -Other repo-root directories: `core/` (C runtime), `packages/` (example/core extensions), -`docs/` (framework docs), `tools/` (Grafana monitoring, profilers). 
- -## Extension Categories - -| Category | Count | Examples | -| --------- | ----- | ----------------------------------------------------------- | -| ASR | 10+ | `deepgram_asr_python`, `azure_asr_python`, `aws_asr_python` | -| TTS | 15+ | `deepgram_tts`, `elevenlabs_tts2_python`, `cartesia_tts` | -| LLM | 8+ | `openai_llm2_python`, `gemini_llm2_python`, `bedrock_llm_python` | -| Avatar | 5+ | `heygen_avatar_python`, `anam_avatar_python` | -| Tools | 8+ | `bingsearch_tool_python`, `vision_tool_python` | -| Transport | 3+ | `agora_rtc`, `websocket_server`, `http_server_python` | -| Other | 10+ | `message_collector2`, `ten_vad_python`, `mcp_client_python` | - -## Extension File Structure - -Every extension follows this layout: - -| File | Purpose | -| ------------------ | ---------------------------------------------- | -| `__init__.py` | Package marker | -| `addon.py` | `@register_addon_as_extension` registration | -| `extension.py` | Main logic, inherits from base class | -| `config.py` | Pydantic config model (optional but common) | -| `manifest.json` | Metadata, dependencies, API interface imports | -| `property.json` | Default config values with `${env:VAR}` syntax | -| `requirements.txt` | Python dependencies | -| `README.md` | Usage documentation (often multilingual) | -| `tests/` | Standalone tests with `bin/start` entry point | - -## Base Classes - -Located in example tenapp directories under `ten_packages/system/ten_ai_base/interface/ten_ai_base/`: - -| File | Class | Purpose | -| ---------- | ---------------------------- | -------------------------- | -| `asr.py` | `AsyncASRBaseExtension` | Speech recognition | -| `tts.py` | `AsyncTTSBaseExtension` | Text-to-speech (basic) | -| `tts2.py` | `AsyncTTS2BaseExtension` | Text-to-speech (advanced) | -| `llm.py` | `AsyncLLMBaseExtension` | Language model completion | -| `llm2.py` | `AsyncLLM2BaseExtension` | Language model v2 | -| `llm_tool.py` | `AsyncLLMToolBaseExtension` | LLM function calling tools | -| 
`mllm.py` | `AsyncMLLMBaseExtension` | Multimodal LLM | - -## API Interface Definitions - -Standard interfaces in `ten_ai_base/api/`: - -| File | Defines | -| ----------------------- | --------------------------------- | -| `asr-interface.json` | ASR data/cmd/audio_frame schemas | -| `tts-interface.json` | TTS data/cmd/audio_frame schemas | -| `llm-interface.json` | LLM data/cmd schemas | -| `mllm-interface.json` | Multimodal LLM schemas | - -Extensions reference these via `manifest.json`: -```json -{"api": {"interface": [{"import_uri": "../../system/ten_ai_base/api/tts-interface.json"}]}} -``` - -## Key Files Quick Reference - -| When working on... | Look at | -| -------------------------- | -------------------------------------------------- | -| New extension | Similar extension in `agents/ten_packages/extension/` | -| API interface changes | `ten_ai_base/api/*.json` | -| Graph configuration | `agents/examples/*/tenapp/property.json` | -| Server endpoints | `server/internal/http_server.go` | -| Build/test tasks | `Taskfile.yml` (root) and per-example | -| Test setup | `agents/ten_packages/extension/*/tests/bin/start` | - -## Related Deep Dives - -- [Extension Development](deep_dives/extension_development.md) — Full creation guide with base class details diff --git a/docs/ai/L1/04_conventions.md b/docs/ai/L1/04_conventions.md deleted file mode 100644 index 41f11901f8..0000000000 --- a/docs/ai/L1/04_conventions.md +++ /dev/null @@ -1,138 +0,0 @@ -# 04 Conventions - -> Coding patterns, naming, configuration, and formatting standards. 
-
-## Naming Conventions
-
-| Item | Pattern | Example |
-| --------------- | -------------------------------------- | -------------------------- |
-| Extension dir | `<vendor>_<function>_python` | `deepgram_asr_python` |
-| Addon name | Same as directory name | `deepgram_asr_python` |
-| Example dir | `voice-assistant-<variant>` | `voice-assistant-realtime` |
-| Config class | `<Vendor><Type>Config(BaseModel)` | `DeepgramTTSConfig` |
-| Client class | `<Vendor><Type>Client` | `DeepgramTTSClient` |
-
-## Addon Registration
-
-Every extension must register via decorator in `addon.py`:
-
-```python
-from ten_runtime import Addon, register_addon_as_extension, TenEnv
-
-@register_addon_as_extension("deepgram_asr_python")
-class DeepgramASRExtensionAddon(Addon):
-    def on_create_instance(self, ten: TenEnv, addon_name: str, context) -> None:
-        ten.on_create_instance_done(DeepgramASRExtension(addon_name), context)
-```
-
-The decorator name **must match** the `addon` field in `property.json` graph nodes.
-
-## Base Class Selection
-
-| Need | Base Class | Key Abstract Methods |
-| ----------------------- | ----------------------------- | ------------------------------------- |
-| Speech-to-text | `AsyncASRBaseExtension` | `vendor()`, `start_connection()`, `send_audio()`, `finalize()` |
-| Text-to-speech (HTTP) | `AsyncTTS2HttpExtension` | `vendor()`, `request_tts()`, `synthesize_audio_sample_rate()` |
-| Text-to-speech (WS) | `AsyncTTS2BaseExtension` | `vendor()`, `request_tts()`, `cancel_tts()` |
-| Chat completion | `AsyncLLMBaseExtension` | `on_call_chat_completion()`, `on_data_chat_completion()` |
-| LLM function tool | `AsyncLLMToolBaseExtension` | `get_tool_metadata()`, `run_tool()` |
-| Generic / custom | `AsyncExtension` | `on_cmd()`, `on_data()`, etc.
| - -## Pydantic Configuration - -Extensions use Pydantic models for config validation: - -```python -from pydantic import BaseModel, Field - -class DeepgramTTSConfig(BaseModel): - api_key: str = "" - model: str = "aura-2-theia-en" - sample_rate: int = 24000 - params: dict[str, Any] = Field(default_factory=dict) -``` - -Config is loaded from property.json in `on_init()`: -```python -config_json, _ = await ten_env.get_property_to_json("") -self.config = DeepgramTTSConfig(**json.loads(config_json)) -``` - -## Environment Variable Syntax - -In `property.json`, reference env vars: - -| Syntax | Behavior | -| --------------------- | --------------------------------------- | -| `${env:VAR_NAME}` | Required — error if missing | -| `${env:VAR_NAME\|}` | Optional — empty string if missing | -| `${env:VAR_NAME\|default}` | Optional — uses default if missing | - -```json -{"api_key": "${env:DEEPGRAM_API_KEY}", "region": "${env:AZURE_REGION|}"} -``` - -## Params Dict Pattern - -Extensions using HTTP/WebSocket services store all config in a `params` dictionary: - -1. **Store** `api_key` inside `params` dict in property.json and config -2. **Extract** for authentication headers in the client constructor -3. 
**Strip** from params **only when creating the HTTP request payload** - -```python -# In client constructor — extract for auth -self.api_key = config.params.get("api_key", "") -self.headers = {"Authorization": f"Bearer {self.api_key}"} - -# In request method — strip before sending -payload = {**self.config.params} -payload.pop("api_key", None) -``` - -## Sensitive Data Logging - -Implement `to_str()` to encrypt sensitive fields before logging: - -```python -def to_str(self, sensitive_handling: bool = True) -> str: - if not sensitive_handling: - return f"{self}" - config = copy.deepcopy(self) - if config.params and "api_key" in config.params: - config.params["api_key"] = utils.encrypt(config.params["api_key"]) - return f"{config}" -``` - -## Logging - -- Use `ten_env.log_info()`, `ten_env.log_warn()`, `ten_env.log_error()`, `ten_env.log_debug()` -- Categories: `LOG_CATEGORY_KEY_POINT` (lifecycle events), `LOG_CATEGORY_VENDOR` (vendor status) -- All output goes to `/tmp/task_run.log` inside the container - -## Import Convention - -```python -# Correct (v0.11+) -from ten_runtime import Addon, register_addon_as_extension, TenEnv - -# Wrong (old v0.8.x — will not work) -from ten import Addon -``` - -## Formatting - -- **Black** formatter with `--line-length 80` -- Run: `task format` (from `ai_agents/`) -- Check: `task check` -- Excludes: `third_party/`, `http_server_python/`, `ten_packages/system` - -## Design Principles - -- **YAGNI**: Only implement what is needed now, not what might be needed later -- **KISS**: Prefer simple solutions; three similar lines > premature abstraction -- **No git-ignored files**: Never modify auto-generated files (manifest-lock.json, out/, .ten/, bin/) - -## Related Deep Dives - -- [Extension Development](deep_dives/extension_development.md) — Full creation guide with implementation walkthroughs diff --git a/docs/ai/L1/05_workflows.md b/docs/ai/L1/05_workflows.md deleted file mode 100644 index acdc4b0b76..0000000000 --- 
a/docs/ai/L1/05_workflows.md +++ /dev/null @@ -1,166 +0,0 @@ -# 05 Workflows - -> Step-by-step guides for common development tasks. - -## Create a New TTS / ASR / LLM Extension - -**Fastest path**: Copy a similar extension and adapt it. - -| Type | Copy From | Base Class | -| ----------- | -------------------------- | --------------------------- | -| TTS (HTTP) | `rime_http_tts` | `AsyncTTS2HttpExtension` | -| TTS (WS) | `deepgram_tts` | `AsyncTTS2BaseExtension` | -| ASR | `deepgram_asr_python` | `AsyncASRBaseExtension` | -| LLM | `openai_llm2_python` | `AsyncLLMBaseExtension` | - -```bash -cp -r agents/ten_packages/extension/deepgram_tts agents/ten_packages/extension/my_vendor_tts -``` - -Then: -1. Rename addon decorator, class names, `manifest.json` `name` field -2. Implement the abstract methods for your vendor API -3. Create `tests/configs/` with required config files (see below) -4. Run guarder tests: `task tts-guarder-test EXTENSION=my_vendor_tts` -5. Run formatter: `task format` - -**Required test config files** for TTS: `property.json`, `property_basic_audio_setting1.json`, -`property_basic_audio_setting2.json`, `property_dump.json`, `property_miss_required.json`, -`property_invalid.json` - -**Required test config files** for ASR: `property_en.json`, `property_zh.json`, -`property_invalid.json`, `property_dump.json` - -For full walkthrough with code and all 15/10 test details, see -[Extension Development](deep_dives/extension_development.md) and [Testing](deep_dives/testing.md). - -## Add Extension to a Graph - -1. **Add node** to `predefined_graphs[].graph.nodes[]` in the example's `tenapp/property.json`: - ```json - {"type": "extension", "name": "my_tts", "addon": "my_tts_python", - "extension_group": "tts_group", - "property": {"api_key": "${env:MY_API_KEY}"}} - ``` - -2. 
**Add connections** — wire data flow between extensions:
-   ```json
-   {"extension": "my_tts",
-    "data": [{"name": "tts_text_input", "source": [{"extension": "main"}]}],
-    "audio_frame": [{"name": "pcm_frame", "dest": [{"extension": "agora_rtc"}]}]}
-   ```
-
-3. **Add dependency** to example `tenapp/manifest.json`:
-   ```json
-   {"type": "extension", "name": "my_tts_python", "version": "0.1.0"}
-   ```
-
-4. **Install** (use `task install`, not just `tman install` — the latter can wipe `bin/main`):
-   ```bash
-   docker exec ten_agent_dev bash -c "cd /app/agents/examples/<example> && task install"
-   ```
-
-5. **Nuclear restart** (required when graphs are added/removed):
-   ```bash
-   sudo docker exec ten_agent_dev bash -c \
-     "pkill -9 -f 'bin/api'; pkill -9 -f bun; pkill -9 -f node; pkill -9 -f next-server; pkill -9 -f tman"
-   sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock"
-   sleep 30  # wait for port 3000 TIME_WAIT to clear
-   sudo docker exec -d ten_agent_dev bash -c \
-     "cd /app/agents/examples/<example> && task run > /tmp/task_run.log 2>&1"
-   ```
-
-See [Graph Configuration](deep_dives/graph_configuration.md) for connection types and routing patterns.
-
-**For complex multi-graph setups** (A/B testing vendors, avatar variants), use
-`rebuild_property.py` instead of hand-editing. See
-[Generating property.json](deep_dives/graph_configuration.md#generating-propertyjson-with-rebuild_propertypy).
-
-## Customize the Main Extension
-
-The "main" extension orchestrates agent behavior (greetings, tool routing, interruption).
-Three implementation variants exist: - -| Variant | File | Use Case | -| -------------------- | --------------------- | ------------------------------- | -| Python Cascade | `main_python_cascade` | ASR → LLM → TTS pipeline | -| Python Realtime V2V | `main_python_realtime`| OpenAI Realtime API (voice-to-voice) | -| Node.js Cascade | `main_nodejs_cascade` | TypeScript implementation | - -Modify `on_data()` to change event routing, `on_cmd()` for tool handling. - -## Run Tests - -```bash -# All tests -docker exec ten_agent_dev bash -c "cd /app && task test" - -# Single extension (with dependency install) -docker exec ten_agent_dev bash -c \ - "cd /app && task test-extension EXTENSION=agents/ten_packages/extension/deepgram_tts" - -# Single extension (skip install — faster) -docker exec ten_agent_dev bash -c \ - "cd /app && task test-extension-no-install EXTENSION=agents/ten_packages/extension/deepgram_tts" - -# ASR guarder integration tests -docker exec ten_agent_dev bash -c \ - "cd /app && task asr-guarder-test EXTENSION=azure_asr_python" - -# TTS guarder integration tests -docker exec ten_agent_dev bash -c \ - "cd /app && task tts-guarder-test EXTENSION=deepgram_tts" -``` - -See [Testing](deep_dives/testing.md) for test structure and debugging. 
- -## Restart After Changes - -| What Changed | Action | -| ------------------------------- | ---------------------------------------------------- | -| `property.json` (graphs added) | Nuclear restart (kill all, remove lock, task run) | -| `property.json` (config only) | No restart needed (loaded per session) | -| `.env` | `docker compose down && docker compose up -d` + deps | -| Python code | Restart server only | -| Go code | `task install` then restart server | -| Container restart | Reinstall Python deps, then `task run` | - -## Build and Install - -```bash -# Full install (first time or after adding extensions) — ALWAYS prefer this -docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples/ && task install" - -# Install Python deps only -docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples//tenapp && bash scripts/install_python_deps.sh" - -# Install extension dependencies only (creates symlinks) — WARNING: can wipe bin/main -docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples//tenapp && tman install" -``` - -## Update Extension Code in Running Container - -See [Operations and Restarts](deep_dives/operations_restarts.md) for the full procedure -including `docker cp` syntax, symlink verification, and restart steps. - -## Pre-Commit Checks - -```bash -# Format Python code (Black, line-length 80) -docker exec ten_agent_dev bash -c "cd /app && task format" - -# Check formatting without modifying -docker exec ten_agent_dev bash -c "cd /app && task check" -``` - -Pre-commit hooks validate: API key patterns, Black formatting, conventional commit messages. 
- -## Related Deep Dives - -- [Extension Development](deep_dives/extension_development.md) — Full extension creation with code examples -- [Graph Configuration](deep_dives/graph_configuration.md) — Connection wiring and routing patterns -- [Testing](deep_dives/testing.md) — Test infrastructure, guarder tests, debugging -- [Operations and Restarts](deep_dives/operations_restarts.md) — Full restart procedures, recovery diff --git a/docs/ai/L1/06_interfaces.md b/docs/ai/L1/06_interfaces.md deleted file mode 100644 index d87b91514f..0000000000 --- a/docs/ai/L1/06_interfaces.md +++ /dev/null @@ -1,150 +0,0 @@ -# 06 Interfaces - -> REST API contracts, graph connection schemas, and base class abstract methods. - -## REST API Endpoints - -The Go server (`server/internal/http_server.go`) exposes: - -| Endpoint | Method | Purpose | Key Fields | -| -------------------- | ------ | ------------------------------------ | --------------------------------- | -| `/health` | GET | Health check | Returns `{"code":"0"}` | -| `/graphs` | GET | List available graphs | Returns `data[].name` | -| `/start` | POST | Start agent session | `graph_name`, `channel_name` | -| `/stop` | POST | Stop agent session | `channel_name` | -| `/ping` | POST | Keep session alive | `channel_name` | -| `/list` | GET | List active sessions | Returns worker list | -| `/token/generate` | POST | Generate Agora RTC token | `channel_name`, `uid` | - -### POST /start Request Body - -```json -{ - "request_id": "uuid", - "channel_name": "test_channel", - "user_uid": 176573, - "graph_name": "voice_assistant", - "properties": { - "openai_llm2_python": {"model": "gpt-4o-mini"} - }, - "timeout": 60 -} -``` - -- `properties` — per-extension overrides merged into graph node properties -- `timeout` — seconds of inactivity before auto-stop (-1 = never) - -## Graph Connection Types - -Connections in `property.json` define data flow between extensions: - -### Command Connections (`cmd`) - -```json -{"extension": "main", "cmd": 
[ - {"name": "tool_register", "dest": [{"extension": "llm"}]}, - {"name": "on_user_joined", "source": [{"extension": "agora_rtc"}]} -]} -``` - -Common commands: `tool_register`, `on_user_joined`, `flush`, `chat_completion_call`, -`update_configs` - -### Data Connections (`data`) - -```json -{"extension": "llm", "data": [ - {"name": "text_data", "source": [{"extension": "main"}]}, - {"name": "text_data", "dest": [{"extension": "tts"}]} -]} -``` - -Common data: `asr_result`, `text_data`, `tts_text_input`, `tts_audio_start`, -`tts_audio_end`, `error` - -### Audio Frame Connections (`audio_frame`) - -```json -{"extension": "agora_rtc", "audio_frame": [ - {"name": "pcm_frame", "dest": [{"extension": "stt"}]} -]} -``` - -### Video Frame Connections (`video_frame`) - -```json -{"extension": "agora_rtc", "video_frame": [ - {"name": "video_frame", "dest": [{"extension": "vision"}]} -]} -``` - -## Base Class Abstract Methods - -### ASR (`AsyncASRBaseExtension`) - -| Method | Returns | Purpose | -| --------------------------- | --------- | -------------------------------- | -| `vendor()` | `str` | Vendor name (e.g., "deepgram") | -| `start_connection()` | `None` | Connect to ASR service | -| `stop_connection()` | `None` | Disconnect | -| `send_audio(frame)` | `bool` | Send audio frame to service | -| `finalize()` | `None` | Drain pending audio | -| `is_connected()` | `bool` | Connection status check | -| `input_audio_sample_rate()` | `int` | Expected sample rate (e.g., 16000)| - -**Output helpers**: `send_asr_result()`, `send_asr_error()`, `send_asr_finalize_end()`, -`send_connect_delay_metrics()`, `send_vendor_metrics()` - -### TTS (`AsyncTTS2BaseExtension`) - -| Method | Returns | Purpose | -| ------------------------------- | -------- | ------------------------------------ | -| `vendor()` | `str` | Vendor name (e.g., "elevenlabs") | -| `request_tts(tts_text_input)` | `AsyncIterator` | Generate audio from text | -| `cancel_tts()` | `None` | Handle flush/cancellation | -| 
`synthesize_audio_sample_rate()`| `int` | Output sample rate (e.g., 24000) | -| `synthesize_audio_channels()` | `int` | Channel count (default: 1) | -| `synthesize_audio_sample_width()`| `int` | Bytes per sample (default: 2) | - -**Output helpers**: `send_tts_audio_data()`, `send_tts_audio_start()`, `send_tts_audio_end()`, -`send_tts_error()`, `send_tts_ttfb_metrics()`, `send_tts_text_result()` - -**State machine**: QUEUED → PROCESSING → FINALIZING → COMPLETED (per request) - -### LLM (`AsyncLLMBaseExtension`) - -| Method | Returns | Purpose | -| ------------------------------- | ------- | -------------------------------- | -| `on_call_chat_completion()` | varies | Handle sync command requests | -| `on_data_chat_completion()` | varies | Handle stream-based data input | -| `on_tools_update(tool_metadata)`| `None` | Handle new tool registration | - -**Tool flow**: Extensions register tools via `CMD_TOOL_REGISTER` → LLM stores in -`available_tools` → LLM calls tools during completion → results returned. - -## Manifest API Interface - -Extensions declare their API interface in `manifest.json`: - -```json -{ - "api": { - "interface": [ - {"import_uri": "../../system/ten_ai_base/api/tts-interface.json"} - ], - "property": { - "api_key": {"type": "string"}, - "model": {"type": "string"}, - "sample_rate": {"type": "int32"} - } - } -} -``` - -Interface JSON files define the standard cmd/data/audio_frame schemas for each extension type. 
- -## Related Deep Dives - -- [Extension Development](deep_dives/extension_development.md) — Implementing abstract methods -- [Server Architecture](deep_dives/server_architecture.md) — Endpoint handlers and property injection -- [Graph Configuration](deep_dives/graph_configuration.md) — Full connection wiring examples diff --git a/docs/ai/L1/07_gotchas.md b/docs/ai/L1/07_gotchas.md deleted file mode 100644 index 15251d5f57..0000000000 --- a/docs/ai/L1/07_gotchas.md +++ /dev/null @@ -1,117 +0,0 @@ -# 07 Gotchas - -> Critical pitfalls, tribal knowledge, and troubleshooting. - -## CRITICAL: Property Getters Return Tuples - -All `get_property_*()` methods return `(value, error_or_none)`, not the raw value. - -```python -# WRONG — causes TypeError -threshold = await ten_env.get_property_float("threshold") -if threshold > 0.5: # TypeError: '>' not supported between 'float' and 'tuple' - -# CORRECT — extract from tuple -threshold_result = await ten_env.get_property_float("threshold") -threshold = threshold_result[0] if isinstance(threshold_result, tuple) else threshold_result -``` - -This applies to `get_property_string()`, `get_property_int()`, `get_property_float()`, -`get_property_bool()`. Always extract `[0]`. - -## CRITICAL: Signal Handlers Forbidden - -Extensions run in worker threads. Signal handlers only work in the main thread. - -```python -# WRONG — raises ValueError: signal only works in main thread -signal.signal(signal.SIGTERM, handler) -atexit.register(cleanup) - -# CORRECT — use extension lifecycle -async def on_stop(self, ten_env): - await self.cleanup() -``` - -## CRITICAL: Always Use `task run` - -Never start the server with `./bin/api` or `./bin/main` directly. -`task run` sets the correct PYTHONPATH and starts all services together -(API server + playground + TMAN Designer). - -## Zombie Worker Processes - -Worker processes (`bin/main`) can survive container and server restarts. -Always check for and kill zombies before restarting. 
- -## .env Loaded at Container Startup Only - -Editing `.env` while the container is running has **no effect**. You must -`docker compose down && docker compose up -d`, then reinstall Python deps. - -## Next.js Lock File - -After crashes, `.next/dev/lock` becomes stale, preventing restart. Delete it -and do a full restart. See [Operations and Restarts](deep_dives/operations_restarts.md). - -## Python Deps Not Persisted - -Python dependencies are lost on container restart. Always reinstall after -`docker compose down && up`. - -## tman install Can Wipe bin/main - -Running `tman install` when system dependencies have newer versions replaces -the runtime packages and **deletes `bin/main`**. Use `task install` (full -rebuild) instead of bare `tman install`. Signs: Worker fails with -`bin/main: No such file or directory` in logs. - -## tman Install Creates Symlinks - -Never manually `ln -s` for extensions. Use `tman install` which resolves -dependencies and creates correct links. If a symlink is missing after -`tman install`, create it manually as a fallback. - -## docker cp Creates Nested Directories - -When using `docker cp` to update extension code, trailing slashes create -nested directories. Use `docker cp ./ext/. container:/path/ext/` syntax. -Signs: `ModuleNotFoundError: No module named 'ten_packages.extension.X'`. - -## Audio Routing: Split at Source Only - -When routing audio to multiple destinations, the split must happen at the -source node (e.g., `agora_rtc`), not at intermediate nodes. Splitting from -intermediate nodes can cause crashes. - -## Frontend Caches Graph List - -The playground caches the `/graphs` API response. When adding or removing -graphs from `property.json`, a full restart is required — simple server -restart is not enough. - -## Manifest Module Name Must Match - -The `name` field in extension `manifest.json` must exactly match the `addon` -field used in graph nodes in `property.json`. Mismatches cause silent failures. 
- -## next-server Holds Port 3000 - -Killing `node` and `bun` is not enough — `next-server` is a separate process -that holds port 3000. If port 3000 is occupied, Next.js silently starts on -3001+ which isn't Docker-exposed, making the frontend appear down. - -## Apple Silicon Docker - -Docker containers may need Rosetta for x86 images on Apple Silicon Macs. -Enable in Docker Desktop: Settings > General > Use Rosetta. - -## Windows Line Endings - -Before cloning on Windows: `git config --global core.autocrlf false` - -## Related Deep Dives - -- [Operations and Restarts](deep_dives/operations_restarts.md) — Full restart procedures, port debugging, recovery -- [Deployment](deep_dives/deployment.md) — Production setup, persistent startup -- [Server Architecture](deep_dives/server_architecture.md) — Worker lifecycle, session management diff --git a/docs/ai/L1/08_security.md b/docs/ai/L1/08_security.md deleted file mode 100644 index 5efef433c8..0000000000 --- a/docs/ai/L1/08_security.md +++ /dev/null @@ -1,88 +0,0 @@ -# 08 Security - -> Secret management, input validation, and repository hygiene. 
- -## API Key Management - -- **Single source**: All API keys live in `ai_agents/.env` (git-ignored) -- **Never hardcode** keys in `property.json` — use `${env:VAR_NAME}` substitution -- **Persistent storage**: Keep a copy of keys outside the repo (e.g., `~/api_keys.txt`) - so branch switches don't lose them -- See `.env.example` for the complete variable catalog - -## Environment Variable Substitution - -In `property.json`, reference secrets via: - -```json -{ - "api_key": "${env:DEEPGRAM_API_KEY}", - "region": "${env:AZURE_REGION|eastus}" -} -``` - -| Syntax | Behavior | -| ------------------------- | ---------------------------- | -| `${env:VAR}` | Required — error if missing | -| `${env:VAR\|}` | Optional — empty if missing | -| `${env:VAR\|default}` | Optional — default if missing| - -## Sensitive Data in Logs - -Extensions must encrypt sensitive fields before logging: - -```python -def to_str(self, sensitive_handling: bool = True) -> str: - config = copy.deepcopy(self) - if config.params and "api_key" in config.params: - config.params["api_key"] = utils.encrypt(config.params["api_key"]) - return f"{config}" -``` - -Never log raw API keys, tokens, or credentials. 
- -## Server-Side Protections - -The Go server (`http_server.go`) implements: - -- **Path traversal prevention**: Ignores client-requested `tenapp_dir`, always uses - the launch-configured directory -- **Channel name sanitization**: Validated before use in file operations -- **Safe type conversion**: Property values are type-checked during merge -- **Recursive property merge**: Prevents injection via nested config overrides - -## Pre-Commit Hooks - -| Hook | What It Checks | -| ------------- | ----------------------------------------------------------- | -| `pre-commit` | Scans staged files for API key patterns (`API_KEY.*=[A-Za-z0-9]{20,}`) | -| `pre-commit` | Black formatting compliance (line-length 80) | -| `commit-msg` | Conventional commit format, blocks AI tool name references | - -## Git-Ignored Files - -These are auto-generated — never modify or commit them: - -| Pattern | Source | -| ---------------------- | ------------------------- | -| `manifest-lock.json` | `tman` dependency resolve | -| `compile_commands.json`| Build system | -| `BUILD.gn`, `.gn` | Build configuration | -| `out/`, `build/` | Build output | -| `.ten/` | TEN runtime files | -| `bin/main`, `bin/worker`| Compiled binaries | -| `.release/` | Release packaging | -| `node_modules/` | JS dependencies | -| `.env` | Environment secrets | - -## Files That Should Never Be Committed - -- `.env` (API keys and secrets) -- `*.pem` (certificates) -- `*.pcm` (audio dumps) -- Credential files, tokens, session data - -## Related Deep Dives - -- [Deployment](deep_dives/deployment.md) — Production security considerations -- [Server Architecture](deep_dives/server_architecture.md) — Server-side validation details diff --git a/docs/ai/L1/deep_dives/_index.md b/docs/ai/L1/deep_dives/_index.md deleted file mode 100644 index 07b5b13b20..0000000000 --- a/docs/ai/L1/deep_dives/_index.md +++ /dev/null @@ -1,10 +0,0 @@ -# Deep Dives Index - -| Document | Summary | Load When | -| 
-------------------------------------------------------- | ------------------------------------------------ | ------------------------------------------------ | -| [extension_development.md](extension_development.md) | Full extension creation guide, base classes, test configs, pre-submission checklist | Creating a new TTS/ASR/LLM extension | -| [graph_configuration.md](graph_configuration.md) | Graph nodes, connections, routing, property.json | Modifying graphs or wiring extensions together | -| [testing.md](testing.md) | All 15 TTS + 10 ASR guarder tests, pass criteria, config files, debugging | Running or debugging tests for an extension | -| [deployment.md](deployment.md) | Docker, Cloudflare, Nginx, Grafana monitoring | Deploying to production or setting up monitoring | -| [server_architecture.md](server_architecture.md) | Go server, property injection, worker lifecycle | Understanding server internals or debugging | -| [operations_restarts.md](operations_restarts.md) | Full restart procedures, port debugging, recovery| Restarting services, crash recovery, port conflicts| diff --git a/docs/ai/L1/deep_dives/deployment.md b/docs/ai/L1/deep_dives/deployment.md deleted file mode 100644 index 3ffae48c77..0000000000 --- a/docs/ai/L1/deep_dives/deployment.md +++ /dev/null @@ -1,206 +0,0 @@ -# Deployment - -> **When to Read This:** Load this document when you are deploying to production, -> setting up HTTPS access, configuring monitoring, or ensuring services persist -> across session closures. 
- -## Docker Compose Setup - -The development container is defined in `ai_agents/docker-compose.yml`: - -```yaml -services: - ten_agent_dev: - image: ghcr.io/ten-framework/ten_agent_build:0.7.14 - container_name: ten_agent_dev - ports: - - "49483:49483" # TMAN Designer - - "3000:3000" # Playground - - "8000-9001:8000-9001" # API + worker range - volumes: - - .:/app - environment: - - LOG_PATH=${LOG_PATH} -``` - -Start: `cd ai_agents && docker compose up -d` - -## Persistent Startup (Survives Session Closure) - -Use `-d` flag with `docker exec` to keep services running after terminal disconnect: - -```bash -# 1. Clean up existing processes -sudo docker exec ten_agent_dev bash -c "pkill -9 -f 'bin/api'; pkill -9 node; pkill -9 bun" -ps -elf | grep 'bin/main' | grep -v grep | awk '{print $4}' | xargs -r sudo kill -9 2>/dev/null - -# 2. Remove stale lock files -sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" - -# 3. Install Python dependencies -sudo docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant-advanced/tenapp && bash scripts/install_python_deps.sh" - -# 4. Start everything in detached mode -sudo docker exec -d ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant-advanced && task run > /tmp/task_run.log 2>&1" - -# 5. Wait and verify -sleep 15 -curl -s http://localhost:8080/health && echo " API ready" -curl -s http://localhost:8080/graphs | jq -r '.data | length' | xargs echo "Graphs:" -curl -s http://localhost:3000 -o /dev/null -w '%{http_code}' | xargs echo "Playground:" -``` - -Key: `-d` flag keeps processes running. `task run` starts API + playground + TMAN Designer. 
- -## Cloudflare Tunnel (Free HTTPS) - -Quick HTTPS access without domain or SSL setup: - -```bash -# Start tunnel -pkill cloudflared -nohup cloudflared tunnel --url http://localhost:3000 > /tmp/cloudflare_tunnel.log 2>&1 & -sleep 5 - -# Get the random URL -grep -o 'https://[^[:space:]]*\.trycloudflare\.com' /tmp/cloudflare_tunnel.log | head -1 -# Example: https://films-colon-msgid-incentives.trycloudflare.com -``` - -- Free tunnels get **random URLs** that change on restart -- No DNS configuration needed -- Good for development and demos - -## Nginx Reverse Proxy (Production HTTPS) - -For production with custom domain and SSL certificates: - -```nginx -server { - listen [::]:453 ssl ipv6only=on; - listen 453 ssl; - ssl_certificate /etc/letsencrypt/live/oai.agora.io/fullchain.pem; - ssl_certificate_key /etc/letsencrypt/live/oai.agora.io/privkey.pem; - include /etc/letsencrypt/options-ssl-nginx.conf; - ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem; - - # API endpoints - location ~ ^/(health|ping|token|start|stop|graphs|list)(/|$) { - proxy_pass http://localhost:8080; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } - - # Playground (with WebSocket upgrade) - location / { - proxy_pass http://localhost:3000; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "upgrade"; - } -} -``` - -Apply: `sudo nginx -t && sudo systemctl reload nginx` - -## Production Build - -```bash -# Build optimized frontend -docker exec ten_agent_dev bash -c "cd /app/playground && npm run build" - -# Start production server -docker exec -d ten_agent_dev bash -c \ - "cd /app/playground && npm start > /tmp/playground_prod.log 2>&1" 
-``` - -## Grafana Monitoring - -Located in `tools/grafana-monitoring/`. Three deployment modes: - -### Pull Mode (Development) - -Prometheus scrapes a metrics endpoint exposed by the TEN runtime: - -```json -// In property.json -{ - "ten": { - "exporter": { - "enabled": true, - "type": "prometheus", - "prometheus": { - "listen_address": "0.0.0.0", - "listen_port": 49484 - } - } - } -} -``` - -Setup: `cd tools/grafana-monitoring && docker compose -f docker-compose.pull.yml up -d` - -### Push Mode (Production) - -Uses OTEL Collector to push metrics to Prometheus and logs to Loki: - -```json -// In property.json -{ - "ten": { - "exporter": { - "enabled": true, - "type": "otlp", - "otlp": { - "endpoint": "http://otel-collector:4317" - } - } - } -} -``` - -Setup: `cd tools/grafana-monitoring && docker compose -f docker-compose.push.yml up -d` - -### Hybrid Mode - -Both Pull and Push simultaneously — useful for A/B testing or migration. - -### Monitored Metrics - -| Metric | Good Threshold | What It Measures | -| --------------------------------- | -------------- | ----------------------------------- | -| Extension Lifecycle Duration | < 1 second | on_configure, on_init, on_start, on_stop, on_deinit | -| Extension CMD Processing Duration | < 100ms | P50/P95 command handling time | -| Thread Message Queue Wait Time | < 50ms | Time messages wait before processing| - -### Log Aggregation (Push Mode Only) - -Push mode sends logs to Loki for centralized querying: - -``` -# LogQL query examples -{service_name="ten_agent"} |= "error" -{service_name="ten_agent"} | json | level="error" -{service_name="ten_agent"} |= "deepgram" | json -``` - -## After Container Restart Checklist - -1. Reinstall Python dependencies (not persisted) -2. Start server with `task run` -3. Restart Cloudflare tunnel (if using) -4. Kill any zombie worker processes on host -5. 
Verify with `/health` and `/graphs` endpoints - -## See Also - -- [Back to Setup](../01_setup.md) -- [Server Architecture](server_architecture.md) — Worker lifecycle, session management diff --git a/docs/ai/L1/deep_dives/extension_development.md b/docs/ai/L1/deep_dives/extension_development.md deleted file mode 100644 index ba8588742d..0000000000 --- a/docs/ai/L1/deep_dives/extension_development.md +++ /dev/null @@ -1,653 +0,0 @@ -# Extension Development - -> **When to Read This:** Load this document when you are creating a new TTS, ASR, or LLM -> extension. It covers the exact files to create, base classes to inherit, abstract methods -> to implement, test configs to provide, and guarder tests your extension must pass. - -## Quick Start: Copy an Existing Extension - -The fastest way to create a new extension is to copy a similar one: - -| Extension Type | Good Template to Copy | Base Class | -| -------------- | ------------------------------- | --------------------------- | -| TTS (HTTP) | `rime_http_tts` | `AsyncTTS2HttpExtension` | -| TTS (WebSocket)| `deepgram_tts` | `AsyncTTS2BaseExtension` | -| ASR (WebSocket)| `deepgram_asr_python` | `AsyncASRBaseExtension` | -| LLM | `openai_llm2_python` | `AsyncLLMBaseExtension` | -| LLM Tool | `bingsearch_tool_python` | `AsyncLLMToolBaseExtension` | - -```bash -cp -r agents/ten_packages/extension/deepgram_tts agents/ten_packages/extension/my_vendor_tts -# Then rename: addon decorator, class names, manifest.json name field -``` - -## Directory Structure - -``` -my_vendor_tts_python/ -├── __init__.py # Can be empty -├── addon.py # Registration (MUST match manifest.json name) -├── extension.py # Main logic OR orchestration -├── my_vendor_tts.py # Vendor client (websocket/http logic) -├── config.py # Pydantic config model -├── manifest.json # Metadata + API interface + property schema -├── property.json # Defaults with ${env:VAR} syntax -├── requirements.txt # Python deps -├── README.md # Usage docs -└── tests/ - ├── bin/ - │ 
└── start # Test entry script (sets PYTHONPATH, runs pytest) - └── configs/ - ├── property.json # Default test config - ├── property_basic_audio_setting1.json # Sample rate test 1 (e.g. 16000) - ├── property_basic_audio_setting2.json # Sample rate test 2 (e.g. 24000) - ├── property_dump.json # Audio dump test config - ├── property_miss_required.json # Missing API key test - └── property_invalid.json # Invalid API key test -``` - -## Step 1: addon.py - -```python -from ten_runtime import Addon, register_addon_as_extension, TenEnv - -@register_addon_as_extension("my_vendor_tts_python") -class MyVendorTTSAddon(Addon): - def on_create_instance(self, ten: TenEnv, addon_name: str, context) -> None: - from .extension import MyVendorTTSExtension - ten.on_create_instance_done(MyVendorTTSExtension(addon_name), context) -``` - -The decorator name **must exactly match** `manifest.json` `name` field AND the `addon` -field in graph nodes. - -## Step 2: config.py - -```python -from pydantic import BaseModel, Field -from typing import Any -import copy -from ten_ai_base import utils - -class MyVendorTTSConfig(BaseModel): - api_key: str = "" - model: str = "default-model" - sample_rate: int = 24000 - dump: bool = False - dump_path: str = "" - params: dict[str, Any] = Field(default_factory=dict) - - def validate(self) -> None: - key = self.api_key or self.params.get("api_key", "") - if not key: - raise ValueError("API key is required") - - def to_str(self, sensitive_handling: bool = True) -> str: - if not sensitive_handling: - return f"{self}" - config = copy.deepcopy(self) - if config.params and "api_key" in config.params: - config.params["api_key"] = utils.encrypt(config.params["api_key"]) - return f"{config}" -``` - -## Step 3: manifest.json - -```json -{ - "type": "extension", - "name": "my_vendor_tts_python", - "version": "0.1.0", - "dependencies": [ - {"type": "system", "name": "ten_runtime_python", "version": "0.8"} - ], - "api": { - "interface": [ - {"import_uri": 
"../../system/ten_ai_base/api/tts-interface.json"} - ], - "property": { - "api_key": {"type": "string"}, - "model": {"type": "string"}, - "sample_rate": {"type": "int32"}, - "dump": {"type": "bool"}, - "dump_path": {"type": "string"}, - "params": {"type": "object"} - } - } -} -``` - -Use `tts-interface.json` for TTS, `asr-interface.json` for ASR, `llm-interface.json` for LLM. - -## Step 4: property.json - -```json -{ - "api_key": "${env:MY_VENDOR_API_KEY}", - "model": "default-model", - "sample_rate": 24000 -} -``` - -## Step 5: extension.py — Implementing the Base Class - -### TTS Extension (WebSocket Mode) - -```python -from ten_ai_base.tts2 import AsyncTTS2BaseExtension - -class MyVendorTTSExtension(AsyncTTS2BaseExtension): - def vendor(self) -> str: - return "my_vendor" - - async def on_init(self, ten_env) -> None: - await super().on_init(ten_env) - config_json, _ = await ten_env.get_property_to_json("") - self.config = MyVendorTTSConfig(**json.loads(config_json)) - self.config.validate() - - async def on_start(self, ten_env) -> None: - await super().on_start(ten_env) - self.client = MyVendorTTSClient(self.config, ten_env) - await self.client.connect() - - async def on_stop(self, ten_env) -> None: - await super().on_stop(ten_env) - await self.client.close() - - async def request_tts(self, tts_text_input) -> AsyncIterator[tuple[bytes, int | None]]: - text = tts_text_input.get_text() - request_id = tts_text_input.get_request_id() - async for audio_chunk in self.client.synthesize(text, request_id): - yield audio_chunk, None # (bytes, event_status) - - async def cancel_tts(self) -> None: - await self.client.cancel() - - def synthesize_audio_sample_rate(self) -> int: - return self.config.sample_rate - - def synthesize_audio_channels(self) -> int: - return 1 # mono - - def synthesize_audio_sample_width(self) -> int: - return 2 # 16-bit -``` - -**TTS2 state machine**: The base class manages request states automatically: -QUEUED -> PROCESSING -> FINALIZING -> 
COMPLETED. Your `request_tts()` just yields audio bytes. - -**Output events sent automatically** by the base class: -- `tts_audio_start` — when first audio chunk is ready -- `pcm_frame` — for each audio chunk -- `tts_audio_end` — when request completes -- `tts_error` — on failure - -### TTS Extension (HTTP Mode) - -Simpler — for non-streaming HTTP APIs: - -```python -from ten_ai_base.tts2_http import AsyncTTS2HttpExtension - -class MyVendorTTSExtension(AsyncTTS2HttpExtension): - def vendor(self) -> str: - return "my_vendor" - - async def request_tts(self, text: str, request_id: str) -> AsyncIterator[bytes]: - async with httpx.AsyncClient() as client: - async with client.stream("POST", self.url, json={"text": text}) as resp: - async for chunk in resp.aiter_bytes(): - yield chunk - - def synthesize_audio_sample_rate(self) -> int: - return self.config.sample_rate -``` - -### ASR Extension - -```python -from ten_ai_base.asr import AsyncASRBaseExtension - -class MyVendorASRExtension(AsyncASRBaseExtension): - def vendor(self) -> str: - return "my_vendor" - - async def start_connection(self) -> None: - self.ws = await websockets.connect(self.url, headers=self.auth_headers) - # Start a listener task for results - asyncio.create_task(self._listen_for_results()) - - async def stop_connection(self) -> None: - if self.ws: - await self.ws.close() - self.ws = None - - async def send_audio(self, frame) -> bool: - buf = frame.lock_buf() - data = bytes(buf) - frame.unlock_buf(buf) - await self.ws.send(data) - return True - - async def finalize(self) -> None: - await self.ws.send(json.dumps({"type": "CloseStream"})) - # Wait for final results before returning - - def is_connected(self) -> bool: - return self.ws is not None and self.ws.open - - def input_audio_sample_rate(self) -> int: - return 16000 - - async def _listen_for_results(self): - async for msg in self.ws: - result = json.loads(msg) - if result.get("is_final"): - asr_result = ASRResult(text=result["transcript"], 
language="en-US", ...) - await self.send_asr_result(asr_result) -``` - -**ASR output methods** you must call: -- `await self.send_asr_result(asr_result)` — for each transcription -- `await self.send_asr_error(error, vendor_info)` — on vendor errors -- `await self.send_asr_finalize_end()` — when finalize completes - -**Buffer strategy**: Override `buffer_strategy()` to return `ASRBufferConfigModeKeep` -if you want audio buffered during disconnects (default discards). - -### LLM Extension - -```python -from ten_ai_base.llm import AsyncLLMBaseExtension - -class MyLLMExtension(AsyncLLMBaseExtension): - async def on_call_chat_completion(self, ten_env, **kwargs): - # Handle command-based chat requests - pass - - async def on_data_chat_completion(self, ten_env, **kwargs): - # Handle stream-based data input - pass - - async def on_tools_update(self, ten_env, tool_metadata): - async with self._available_tools_lock: - self.available_tools = tool_metadata -``` - ---- - -## TTS Audio Pipeline: Data Types and Flow - -Understanding the data types is critical for implementing TTS extensions correctly. 
- -### Data Flow Through the Pipeline - -``` -User speaks → Agora RTC → pcm_frame → ASR → asr_result → main_control - → text_data → LLM → text_data → main_control → tts_text_input → TTS - → pcm_frame → Agora RTC → User hears -``` - -### tts_text_input (incoming to your extension) - -```python -class TTSTextInput: - request_id: str # Unique request identifier - text: str # Text chunk to synthesize - text_input_end: bool # True = last chunk for this request_id - metadata: dict # Context: {session_id, turn_id, ...} -``` - -- Multiple `tts_text_input` messages can share one `request_id` (the "append" pattern) -- `text_input_end=True` signals no more text is coming for this request -- The base class handles queuing and buffering — your `request_tts()` receives complete inputs - -### tts_audio_start / tts_audio_end (outgoing from your extension) - -These are sent automatically by the base class. You don't need to send them manually. - -```json -// tts_audio_start -{"request_id": "req1", "metadata": {"session_id": "sess1", "turn_id": 1}} - -// tts_audio_end -{ - "request_id": "req1", - "request_event_interval_ms": 1500, - "request_total_audio_duration_ms": 3200, - "reason": 1, - "metadata": {"session_id": "sess1", "turn_id": 1} -} -``` - -**Reason values**: `REQUEST_END` (1) = normal completion, `INTERRUPTED` (2) = flush/cancel, -`ERROR` (3) = failure. - -### tts_flush / tts_flush_end - -Flush is triggered when the user interrupts (speaks while TTS is playing). - -```json -// tts_flush (incoming signal) -{"flush_id": "flush_abc123", "metadata": {"session_id": "sess1"}} - -// tts_flush_end (your extension's response — sent automatically by base class) -{"flush_id": "flush_abc123", "metadata": {"session_id": "sess1"}} -``` - -**Critical**: `flush_id` and `metadata` must be echoed back exactly. - -## Flush Handling in TTS Extensions - -The base class (`AsyncTTS2BaseExtension`) handles most flush logic automatically. 
-Your extension only needs to implement `cancel_tts()`: - -```python -async def cancel_tts(self) -> None: - """Called when a flush signal arrives. Stop any in-progress synthesis.""" - if self.client: - await self.client.cancel() -``` - -### What the Base Class Does on Flush - -1. Acquires `_put_lock` to block new `tts_text_input` arrivals -2. Clears `_flush_complete_event` to prevent race conditions -3. Flushes the internal queue (discards all pending items) -4. Calls `cancel_tts()` on your extension (you stop the vendor API) -5. Sends `tts_audio_end` with `reason=INTERRUPTED` for the current request -6. Sends `tts_flush_end` with the echoed `flush_id` and `metadata` -7. Resets all request state (ready for next request) -8. Sets `_flush_complete_event` to re-enable queue processing - -### Request Interleaving (How Buffering Works) - -When multiple requests arrive with different `request_id`s: - -1. First request is processed immediately (`_processing_request_id = "req1"`) -2. Messages for other request_ids are **buffered** in `_pending_messages` -3. When req1 completes, the next buffered request is released (FIFO order) -4. Each request maintains strict event ordering: `audio_start → frames → audio_end` - -Your `request_tts()` doesn't need to handle interleaving — the base class does it. - -## The Three property.json Files - -There are three distinct `property.json` files with different roles: - -### 1. Extension Defaults (`agents/ten_packages/extension//property.json`) - -Default config for the extension. Loaded when no overrides are specified: - -```json -{ - "api_key": "${env:MY_VENDOR_API_KEY}", - "model": "default-model", - "sample_rate": 24000 -} -``` - -### 2. 
App Graph Definition (`agents/examples//tenapp/property.json`) - -Defines the complete agent — nodes, connections, per-instance overrides: - -```json -{ - "ten": { - "predefined_graphs": [{ - "name": "voice_assistant", - "graph": { - "nodes": [ - {"name": "tts", "addon": "my_vendor_tts_python", - "property": {"model": "high-quality", "sample_rate": 24000}} - ], - "connections": [...] - } - }] - } -} -``` - -Properties here **override** extension defaults for this specific graph instance. - -### 3. Test Configs (`agents/ten_packages/extension//tests/configs/*.json`) - -Used by guarder tests. Each test loads a specific config file: - -```json -{ - "dump": true, - "dump_path": "./tests/dump_output/", - "params": {"key": "${env:MY_VENDOR_API_KEY}", "sample_rate": 16000} -} -``` - -**Loading order**: Extension defaults → App graph overrides → Test config overrides. - ---- - -## Step 6: Test Configuration Files - -Your extension's `tests/configs/` directory needs these config files for the guarder tests to work: - -### For TTS Extensions - -| Config File | Purpose | Content | -| ------------------------------------ | -------------------------------------- | -------------------------------------- | -| `property.json` | Default test config | Valid API key, default model/settings | -| `property_basic_audio_setting1.json` | Sample rate test 1 | `sample_rate: 16000` + valid key | -| `property_basic_audio_setting2.json` | Sample rate test 2 | `sample_rate: 24000` + valid key | -| `property_dump.json` | Audio dump test | `dump: true, dump_path: "./tests/dump_output/"` | -| `property_miss_required.json` | Missing params error test | Empty API key | -| `property_invalid.json` | Invalid params error test | Empty or invalid API key | - -**Example `property.json`** (for elevenlabs): -```json -{ - "params": { - "key": "${env:ELEVENLABS_TTS_KEY}", - "model_id": "eleven_turbo_v2_5" - } -} -``` - -**Example `property_basic_audio_setting1.json`**: -```json -{ - "dump": true, - 
"dump_path": "./tests/keep_dump_output/", - "params": { - "sample_rate": 16000, - "key": "${env:ELEVENLABS_TTS_KEY}" - } -} -``` - -**Example `property_basic_audio_setting2.json`**: -```json -{ - "dump": true, - "dump_path": "./tests/keep_dump_output/", - "params": { - "sample_rate": 24000, - "key": "${env:ELEVENLABS_TTS_KEY}" - } -} -``` - -**Example `property_miss_required.json`**: -```json -{ - "params": {"key": ""} -} -``` - -### For ASR Extensions - -| Config File | Purpose | Content | -| ------------------------ | -------------------------- | ------------------------------------ | -| `property_en.json` | English transcription test | Valid key + `language: "en-US"` | -| `property_zh.json` | Chinese transcription test | Valid key + `language: "zh-CN"` | -| `property_invalid.json` | Error handling test | `key: "invalid", region: "invalid"` | -| `property_dump.json` | Audio dump test | Valid key + `dump: true` | - ---- - -## Step 7: TTS Guarder Tests Your Extension Must Pass - -Run with: `task tts-guarder-test EXTENSION=my_vendor_tts_python` - -There are **15 tests**. 
Here's what each validates: - -### Must-Pass Tests - -| Test | What It Validates | -| --------------------------------------- | -------------------------------------------------------- | -| `test_append_input` | Multiple text inputs appended with same request_id | -| `test_append_input_stress` | High volume of append operations | -| `test_append_input_without_text_input_end` | Missing text_input_end flags handled gracefully | -| `test_append_interrupt` | New requests interrupting in-progress ones | -| `test_basic_audio_setting` | Different sample rates produce different audio | -| `test_corner_input` | Special chars, emojis, very short/long text | -| `test_dump` | Audio dump files created with valid PCM data | -| `test_dump_each_request_id` | Each request_id produces separate dump file | -| `test_empty_text_request` | Empty/whitespace text: audio_end within 500ms, no crash | -| `test_flush` | Flush signal: receives flush_end, no data after 5s | -| `test_interleaved_requests` | 8 concurrent requests maintain separate audio streams | -| `test_invalid_required_params` | Invalid API key returns FATAL ERROR, no crash | -| `test_invalid_text_handling` | Malformed text handled without crash | -| `test_metrics` | TTFB metrics generated with valid timestamps | -| `test_miss_required_params` | Missing API key returns appropriate error | - -### Critical Pass Criteria - -- **Event ordering**: `tts_audio_start` -> `pcm_frame`(s) -> `tts_audio_end` per request -- **Request isolation**: Interleaved requests must not mix audio streams -- **Error handling**: Invalid/missing configs must produce errors, never crashes -- **Empty text**: Must complete quickly (audio_end within 500ms), no audio generated -- **Flush**: After flush_end, no more data for 5 seconds -- **Dump files**: Valid PCM data, one file per request_id when enabled - -## Step 8: ASR Guarder Tests Your Extension Must Pass - -Run with: `task asr-guarder-test EXTENSION=my_vendor_asr_python` - -There are **10 tests** (1 
skipped by default): - -| Test | What It Validates | -| --------------------------- | ------------------------------------------------------------ | -| `test_connection_timing` | Connects and transcribes English audio correctly | -| `test_asr_result` | Result structure: id, text, language, session_id fields | -| `test_asr_finalize` | Finalize signal produces final=True result + finalize_end | -| `test_reconnection` | Recovers gracefully after connection failure | -| `test_vendor_error` | Invalid creds produce proper error with vendor info | -| `test_multi_language` | English (en-US) and Chinese (zh-CN) both transcribe correctly| -| `test_dump` | Audio dump files created correctly | -| `test_metrics` | TTFW and TTLW metrics: positive, TTLW > TTFW | -| `test_audio_timestamp` | start_ms and duration_ms accuracy | -| `test_long_duration_stream` | **Skipped by default** — 5+ min stream without timeout | - -### Critical Pass Criteria - -- **Result fields**: Every ASR result must have `id`, `text`, `language`, `session_id` -- **Finalize**: Must produce `final=True` result and `asr_finalize_end` response -- **Error format**: Errors must have `id`, `module`, `code`, `message` + vendor info -- **Metrics**: TTFW > 0, TTLW > TTFW, both in milliseconds -- **Audio format**: Accepts 16-bit PCM, 16kHz, mono, 320 bytes per frame - ---- - -## AudioFrame Creation Pattern - -```python -from ten_runtime import AudioFrame, AudioFrameDataFmt - -frame = AudioFrame.create("pcm_frame") -frame.set_sample_rate(16000) -frame.set_bytes_per_sample(2) # 16-bit -frame.set_number_of_channels(1) # Mono -frame.set_data_fmt(AudioFrameDataFmt.INTERLEAVE) -frame.set_samples_per_channel(len(pcm_data) // 2) -frame.alloc_buf(len(pcm_data)) -buf = frame.lock_buf() -buf[:] = pcm_data -frame.unlock_buf(buf) -await ten_env.send_audio_frame(frame) -``` - -Set all properties **before** `alloc_buf()`. - -## Params Dict Pattern - -For HTTP/WebSocket vendor APIs: - -1. 
Store all config including `api_key` in `params` dict -2. Extract `api_key` for auth headers in client constructor -3. Strip `api_key` from params **only when building the HTTP request payload** -4. In `update_params()`: add vendor-required params, normalize keys - -```python -# Client constructor -self.api_key = config.params.get("api_key", "") -self.headers = {"Authorization": f"Bearer {self.api_key}"} - -# Request method -payload = {**self.config.params} -payload.pop("api_key", None) -``` - -## Bidirectional Extension Pattern - -For extensions that both receive from and send to the graph: - -```python -class MyBridge(AsyncExtension): - async def on_init(self, ten_env): - self.ten_env = ten_env # Store for callbacks - - async def on_audio_frame(self, ten_env, audio_frame): - buf = audio_frame.lock_buf() - self.external_system.send(bytes(buf)) - audio_frame.unlock_buf(buf) - - async def _external_callback(self, data): - frame = AudioFrame.create("pcm_frame") - # ... fill frame ... - await self.ten_env.send_audio_frame(frame) -``` - -## Pre-Submission Checklist - -- [ ] `addon.py` decorator name matches `manifest.json` `name` field -- [ ] All abstract methods implemented (vendor, request_tts/send_audio, etc.) 
-- [ ] Config validation raises ValueError for missing required params -- [ ] `to_str()` encrypts sensitive fields before logging -- [ ] `tests/configs/` has all required config files (see Step 6) -- [ ] `task tts-guarder-test` or `task asr-guarder-test` passes -- [ ] `task format` passes (Black, line-length 80) -- [ ] `task lint-extension EXTENSION=my_vendor_tts_python` passes -- [ ] `requirements.txt` lists all Python dependencies -- [ ] `README.md` documents config properties and env vars -- [ ] No hardcoded API keys anywhere - -## Language-Specific Notes - -| Language | Create Command | -| ---------- | -------------------------------------------------------------------- | -| Python | `tman create extension name --template default_async_extension_python` | -| Go | `tman create extension name --template default_extension_go` | -| C++ | `tman create extension name --template default_extension_cpp` | -| Node.js | `tman create extension name --template default_extension_nodejs` | - -## Portal References (Full Guides) - -- [Create a TTS Extension (89K)](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/extension_dev/create_tts_extension.mdx) [EXTERNAL] -- [Create an ASR Extension (39K)](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/extension_dev/create_asr_extension.mdx) [EXTERNAL] -- [Create a Hello World Extension](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/extension_dev/create_hello_world_extension.mdx) [EXTERNAL] - -## See Also - -- [Back to Conventions](../04_conventions.md) -- [Back to Workflows](../05_workflows.md) -- [Testing](testing.md) — Full guarder test details and debugging diff --git a/docs/ai/L1/deep_dives/graph_configuration.md b/docs/ai/L1/deep_dives/graph_configuration.md deleted file mode 100644 index bfabb78ac8..0000000000 --- a/docs/ai/L1/deep_dives/graph_configuration.md +++ /dev/null @@ -1,410 +0,0 @@ -# Graph Configuration - -> **When to 
Read This:** Load this document when you are modifying graph definitions -> in property.json, adding extensions to agent pipelines, or debugging data flow issues. - -## Overview - -Graphs define which extensions run and how they communicate. They are declared in -`property.json` under the `predefined_graphs` array within the `ten` namespace. - -## Property.json Structure - -```json -{ - "ten": { - "log": { - "handlers": [...] - }, - "predefined_graphs": [ - { - "name": "voice_assistant", - "auto_start": true, - "graph": { - "nodes": [...], - "connections": [...] - } - } - ] - } -} -``` - -- `name` — graph identifier, used in `/start` request's `graph_name` field -- `auto_start` — set to `true` by the server for the selected graph at runtime -- `graph.nodes` — extension instances -- `graph.connections` — data flow wiring - -## Node Schema - -```json -{ - "type": "extension", - "name": "stt", - "addon": "deepgram_asr_python", - "extension_group": "transcription_group", - "property": { - "api_key": "${env:DEEPGRAM_API_KEY}", - "model": "nova-2", - "language": "en-US", - "sample_rate": 16000 - } -} -``` - -| Field | Required | Purpose | -| ----------------- | -------- | ------------------------------------------------- | -| `type` | Yes | Always `"extension"` | -| `name` | Yes | Instance name (used in connections) | -| `addon` | Yes | Extension package name (must match manifest.json) | -| `extension_group` | No | Thread grouping for extensions | -| `property` | No | Config overrides merged with extension defaults | - -## Connection Schema - -Connections define how messages flow between extensions: - -```json -{ - "extension": "main", - "cmd": [ - {"name": "flush", "dest": [{"extension": "llm"}, {"extension": "tts"}]}, - {"name": "on_user_joined", "source": [{"extension": "agora_rtc"}]} - ], - "data": [ - {"name": "text_data", "source": [{"extension": "llm"}]}, - {"name": "text_data", "dest": [{"extension": "tts"}]} - ] -} -``` - -Each connection block is **from the 
perspective of the named extension**: -- `source` — "this extension receives X from these sources" -- `dest` — "this extension sends X to these destinations" - -## Full Graph Example - -A basic voice assistant pipeline (ASR → LLM → TTS): - -```json -{ - "name": "voice_assistant", - "auto_start": false, - "graph": { - "nodes": [ - { - "type": "extension", "name": "agora_rtc", "addon": "agora_rtc", - "extension_group": "rtc_group", - "property": {"app_id": "${env:AGORA_APP_ID}", "channel": "default"} - }, - { - "type": "extension", "name": "stt", "addon": "deepgram_asr_python", - "extension_group": "stt_group", - "property": {"api_key": "${env:DEEPGRAM_API_KEY}", "model": "nova-2"} - }, - { - "type": "extension", "name": "llm", "addon": "openai_llm2_python", - "extension_group": "llm_group", - "property": {"api_key": "${env:OPENAI_API_KEY}", "model": "${env:OPENAI_MODEL}"} - }, - { - "type": "extension", "name": "tts", "addon": "elevenlabs_tts2_python", - "extension_group": "tts_group", - "property": {"api_key": "${env:ELEVENLABS_TTS_KEY}"} - } - ], - "connections": [ - { - "extension": "agora_rtc", - "audio_frame": [ - {"name": "pcm_frame", "dest": [{"extension": "stt"}]} - ] - }, - { - "extension": "stt", - "data": [ - {"name": "asr_result", "dest": [{"extension": "main"}]} - ] - }, - { - "extension": "main", - "cmd": [ - {"name": "flush", "dest": [{"extension": "llm"}, {"extension": "tts"}]}, - {"name": "on_user_joined", "source": [{"extension": "agora_rtc"}]} - ], - "data": [ - {"name": "text_data", "source": [{"extension": "llm"}]}, - {"name": "text_data", "dest": [{"extension": "tts"}]} - ] - }, - { - "extension": "tts", - "data": [ - {"name": "tts_text_input", "source": [{"extension": "main"}]} - ], - "audio_frame": [ - {"name": "pcm_frame", "dest": [{"extension": "agora_rtc"}]} - ] - } - ] - } -} -``` - -## Connection Types Reference - -| Type | Direction | Payload | Example Names | -| ------------- | --------- | ------------------ | 
----------------------------------- | -| `cmd` | Both | Named commands | `flush`, `tool_register`, `on_user_joined`, `chat_completion_call`, `update_configs` | -| `data` | Both | Named data msgs | `asr_result`, `text_data`, `tts_text_input`, `tts_audio_start`, `tts_audio_end`, `error` | -| `audio_frame` | Both | PCM audio streams | `pcm_frame` | -| `video_frame` | Both | Video streams | `video_frame` | - -## Parallel Audio Routing - -When sending audio to multiple destinations, split at the **source node**: - -```json -// CORRECT — split at agora_rtc (source) -{ - "extension": "agora_rtc", - "audio_frame": [ - {"name": "pcm_frame", "dest": [ - {"extension": "stt"}, - {"extension": "vad"} - ]} - ] -} -``` - -Do NOT split from intermediate nodes — this causes runtime crashes. - -## Property Injection - -When the server processes a `/start` request, it dynamically modifies the graph: - -1. **Graph selection**: Filters `predefined_graphs` to match `graph_name`, sets `auto_start: true` -2. **Channel injection**: Scans all nodes — any node with a `"channel"` property gets `channel_name` injected -3. **Start params**: Injects `remote_stream_id`, `bot_stream_id`, `token` via `startPropMap` -4. **Extension overrides**: Merges `req.Properties[extensionName]` into matching node properties -5. **Env var validation**: Resolves all `${env:VAR}` references - -This is why `agora_rtc` and any custom extension with a `"channel"` property automatically -receive the dynamic channel name without code changes. - -## Adding a New Graph - -1. Add a new entry to `predefined_graphs[]` in the example's `tenapp/property.json` -2. Ensure all referenced extensions are listed in `tenapp/manifest.json` -3. Run `tman install` to create symlinks for new dependencies -4. **Nuclear restart** required (frontend caches the graph list) - -## Generating property.json with rebuild_property.py - -For complex deployments with many graph variants, hand-editing property.json is -error-prone. 
The `voice-assistant-advanced` example uses a Python script to generate -it programmatically: - -**Location**: `agents/examples/voice-assistant-advanced/tenapp/rebuild_property.py` - -**Usage**: -```bash -docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples/voice-assistant-advanced/tenapp && python3 rebuild_property.py" -``` - -### How It Works - -The script defines reusable **node configs** as Python dicts, then assembles them -into graphs with helper functions: - -```python -# 1. Define reusable node configs -nova3_stt_100ms = { - "type": "extension", "name": "stt", "addon": "deepgram_ws_asr_python", - "extension_group": "stt", - "property": { - "params": { - "api_key": "${env:DEEPGRAM_API_KEY}", - "model": "nova-3", "language": "en-US", - "interim_results": True, "endpointing": 100, - } - }, -} - -cartesia_tts_sonic3 = { - "type": "extension", "name": "tts", "addon": "cartesia_tts", - "extension_group": "tts", - "property": { - "dump": False, "dump_path": "./", - "params": { - "api_key": "${env:CARTESIA_TTS_KEY}", - "model_id": "sonic-3", - "output_format": {"container": "raw", "sample_rate": 44100}, - }, - }, -} - -gpt51_llm = { - "type": "extension", "name": "llm", "addon": "openai_llm2_python", - "extension_group": "chatgpt", - "property": { - "base_url": "https://api.openai.com/v1", - "api_key": "${env:OPENAI_API_KEY}", - "model": "gpt-5.1", "max_tokens": 1000, - "prompt": "...", "greeting": "...", - }, -} - -# 2. Define reusable connection templates -basic_connections = [ - {"extension": "main_control", "cmd": [...], "data": [...]}, - {"extension": "agora_rtc", "audio_frame": [...], "data": [...]}, - {"extension": "streamid_adapter", "audio_frame": [...]}, - {"extension": "tts", "data": [...], "audio_frame": [...]}, - # ... -] - -# 3. 
Assemble graphs with helper functions -def create_basic_voice_assistant(name, has_avatar=False, avatar_type=None, - tts_config=None, stt_config=None, llm_config=None): - nodes = [agora_rtc_base, stt_config or nova3_stt_100ms, llm_config or ..., ...] - connections = copy.deepcopy(basic_connections) - if has_avatar: - # Modify connections: route TTS audio through avatar instead of direct to RTC - ... - return {"name": name, "auto_start": False, "graph": {"nodes": nodes, "connections": connections}} - -# 4. Build graph list and write property.json -new_graphs = [ - create_basic_voice_assistant("voice_assistant"), - create_basic_voice_assistant("voice_assistant_heygen", has_avatar=True, avatar_type="heygen"), - create_apollo_graph("flux_apollo_gpt_5_1_cartesia", gpt51_llm, flux_stt), - # ... -] - -new_data = {"ten": {"log": log_config, "predefined_graphs": new_graphs}} -with open("property.json", "w") as f: - json.dump(new_data, f, indent=2) -``` - -### Key Patterns in rebuild_property.py - -| Pattern | Purpose | -| ---------------------------- | ---------------------------------------------------- | -| `copy.deepcopy(config)` | Prevent mutation when reusing node configs | -| Parametric helper functions | `create_basic_voice_assistant(name, tts_config=...)` | -| Connection rewiring for avatars | Route TTS audio through avatar instead of direct to RTC | -| Preserve existing log config | `log_config = data["ten"]["log"]` before overwriting | -| Commented-out graph groups | Keep old graph definitions for reference/reactivation| - -### When to Use rebuild_property.py - -- **Multiple graph variants** (A/B testing vendors: Deepgram vs Cartesia TTS) -- **Avatar variants** (same pipeline with/without HeyGen/Anam) -- **LLM model testing** (GPT-4o vs GPT-5.1 vs Groq) -- **Complex connection rewiring** (avatar graphs need different audio routing) - -For simple single-graph setups, editing property.json directly is fine. 
- -## Manifest.json Dependencies - -When adding an extension to a graph, ensure its dependency is in `manifest.json`: - -```json -{ - "dependencies": [ - {"type": "extension", "name": "my_vendor_tts_python", "version": "0.1.0"} - ] -} -``` - -Then run: -```bash -docker exec ten_agent_dev bash -c "cd /app/agents/examples//tenapp && tman install" -``` - -## Main Extension Customization - -The "main" extension controls agent orchestration. Three variants exist: - -| Variant | Language | Pattern | Use Case | -| -------------------- | ---------- | ---------------------------- | -------------------------- | -| Python Cascade | Python | ASR → LLM → TTS pipeline | Standard voice assistant | -| Python Realtime V2V | Python | OpenAI Realtime API | Voice-to-voice (no ASR/TTS)| -| Node.js Cascade | TypeScript | ASR → LLM → TTS pipeline | TypeScript preference | - -Key customization points: -- `on_data()` — event routing (match/case dispatcher) -- `on_cmd()` — tool registration and handling -- Greeting logic in `on_start()` or `on_user_joined` handler - -## Example Apps - -Available in `agents/examples/`. 
Key examples: - -| Example | Description | -| --------------------------------- | ---------------------------------------------------- | -| `voice-assistant` | Basic: Deepgram ASR + OpenAI LLM + ElevenLabs TTS | -| `voice-assistant-advanced` | Multiple graph variants, vendor A/B testing | -| `voice-assistant-realtime` | OpenAI Realtime API (voice-to-voice, no ASR/TTS) | -| `voice-assistant-video` | Vision capability added | -| `voice-assistant-live2d` | Live2D avatar integration | -| `voice-assistant-sip-twilio` | SIP phone integration (Twilio) | -| `voice-assistant-sip-telnyx` | SIP phone integration (Telnyx) | -| `voice-assistant-sip-plivo` | SIP phone integration (Plivo) | -| `voice-assistant-with-ten-vad` | Custom VAD (Voice Activity Detection) | -| `voice-assistant-with-turn-detection` | Transformer-based turn detection | -| `voice-assistant-nodejs` | Node.js implementation | -| `doodler` | Spoken prompts → hand-drawn sketches | -| `speaker-diarization` | Real-time multi-speaker identification | -| `transcription` | Audio transcription tool | -| `websocket-example` | WebSocket transport (no Agora RTC) | -| `http-control` | HTTP-based control interface | - -### voice-assistant vs voice-assistant-advanced - -| Aspect | voice-assistant | voice-assistant-advanced | -| --------------------- | --------------------------- | --------------------------------- | -| Graphs | 1 (`voice_assistant`) | 4+ variants (Flux/Apollo/Cartesia)| -| Vendor switching | Fixed components | Multiple vendor combinations | -| LLM prompts | Simple greeting | Multi-step research workflows | -| Use case | Getting started | Production A/B testing | - -Both follow the same core pipeline: -``` -Agora RTC → streamid_adapter → ASR → main_control → LLM → TTS → Agora RTC -``` - -### Real Graph: voice-assistant/tenapp/property.json - -This is a complete, working graph. 
Key nodes: - -| Node | Addon | Role | -| ------------------ | ------------------------ | ---------------------------------- | -| `agora_rtc` | `agora_rtc` | Audio/video transport | -| `streamid_adapter` | `streamid_adapter` | Stream ID routing | -| `stt` | `deepgram_asr_python` | Speech-to-text | -| `llm` | `openai_llm2_python` | Language model | -| `tts` | `elevenlabs_tts2_python` | Text-to-speech | -| `main_control` | `main_python` | Orchestration (greetings, routing) | -| `message_collector` | `message_collector2` | Transcript collection | - -Connection wiring: -``` -agora_rtc --pcm_frame--> streamid_adapter --pcm_frame--> stt -stt --asr_result--> main_control -main_control --text_data--> llm --text_data--> main_control --tts_text_input--> tts -tts --pcm_frame--> agora_rtc -``` - -## Portal References - -- [Understanding property.json](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/project_structure/property_json.md) [EXTERNAL] -- [Customize Agent via Code](https://github.com/TEN-framework/portal/blob/main/content/docs/ten_agent_examples/customize_agent/modify-main/index.mdx) [EXTERNAL] - -## See Also - -- [Back to Architecture](../02_architecture.md) -- [Back to Workflows](../05_workflows.md) -- [Back to Interfaces](../06_interfaces.md) diff --git a/docs/ai/L1/deep_dives/operations_restarts.md b/docs/ai/L1/deep_dives/operations_restarts.md deleted file mode 100644 index c4bff1c10f..0000000000 --- a/docs/ai/L1/deep_dives/operations_restarts.md +++ /dev/null @@ -1,191 +0,0 @@ -# Operations and Restarts - -> **When to Read This:** Load this document when you need to restart services, -> debug port conflicts, recover from crashes, or clean up zombie processes. 
- -## When to Do a Full Restart - -| What Changed | Action | -| ------------------------------- | ---------------------------------------------------- | -| `property.json` (graphs added) | Full restart (frontend caches graph list) | -| `property.json` (config only) | No restart needed (loaded per session) | -| `.env` | `docker compose down && docker compose up -d` + deps | -| Python code | Restart server only | -| Go code | `task install` then restart server | -| Container restart | Reinstall Python deps, then `task run` | - -## Full Restart Procedure - -Must kill `next-server` too — it holds port 3000 even after `node`/`bun` die: - -```bash -# 1. Kill EVERYTHING -sudo docker exec ten_agent_dev bash -c \ - "pkill -9 -f 'bin/api'; pkill -9 -f bun; pkill -9 -f node; \ - pkill -9 -f next-server; pkill -9 -f tman" - -# 2. Clean up stale files -sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" - -# 3. Wait for port 3000 TIME_WAIT to clear -# If Next.js can't bind port 3000, it silently starts on 3001/3002 -# which isn't exposed by Docker — the frontend appears down. -sleep 30 - -# 4. Start -sudo docker exec -d ten_agent_dev bash -c \ - "cd /app/agents/examples/ && task run > /tmp/task_run.log 2>&1" - -# 5. Verify (wait ~12s for startup) -sleep 12 -sudo docker exec ten_agent_dev bash -c \ - "curl -s http://localhost:8080/health && \ - curl -s -o /dev/null -w ' Frontend:%{http_code}' http://localhost:3000/" -``` - -## Verification - -Check Next.js started on port 3000 (not 3001+): - -```bash -sudo docker exec ten_agent_dev bash -c \ - "strings /tmp/task_run.log | grep -E 'Local:|Port|Ready|Error'" -``` - -Expected output: -``` - - Local: http://localhost:3000 - Ready in 2.1s -``` - -If you see `Port 3000 is in use`, the frontend is on the wrong port. 
- -## Zombie Worker Cleanup - -Worker processes (`bin/main`) run inside Docker but can survive server restarts: - -```bash -# Check for zombies -sudo docker exec ten_agent_dev bash -c \ - "ps aux | grep 'bin/main' | grep -v grep" - -# Kill them -sudo docker exec ten_agent_dev bash -c \ - "pkill -9 -f 'bin/main'" -``` - -Always kill zombies before restarting the server. - -## Stale Lock Cleanup - -After crashes, `.next/dev/lock` becomes stale: - -```bash -sudo docker exec ten_agent_dev bash -c "rm -f /app/playground/.next/dev/lock" -``` - -Also clear the Next.js cache if React version errors appear: - -```bash -sudo docker exec ten_agent_dev bash -c "rm -rf /app/playground/.next" -``` - -## Port 3000 Conflict Debugging - -If Next.js reports "Port 3000 is in use", find the process holding it: - -```bash -sudo docker exec ten_agent_dev bash -c \ - "for pid in /proc/[0-9]*/fd/*; do \ - link=\$(readlink \$pid 2>/dev/null); \ - echo \"\$link\" | grep -q socket: && \ - inode=\$(echo \$link | grep -oP '\\d+') && \ - grep -q \$inode /proc/net/tcp6 2>/dev/null && \ - grep \$inode /proc/net/tcp6 | grep -q ':0BB8' && \ - echo PID=\$(echo \$pid | cut -d/ -f3) && break; \ - done" -``` - -Kill the PID, wait for TIME_WAIT to clear (~30s), then restart. - -If no PID is found but port is still busy, it's in TIME_WAIT state. Check: - -```bash -sudo docker exec ten_agent_dev bash -c \ - "cat /proc/net/tcp6 | grep ':0BB8'" -``` - -State `06` = TIME_WAIT. Wait 30-60 seconds for it to clear. - -## .env and Container Restart Recovery - -`.env` is loaded at container startup only. 
After editing: - -```bash -cd /home/ubuntu/ten-framework/ai_agents -docker compose down && docker compose up -d -``` - -Then reinstall everything (Python deps are not persisted): - -```bash -sudo docker exec ten_agent_dev bash -c \ - "cd /app/agents/examples/ && task install" -``` - -## Copying Extension Code to Running Container - -When iterating on extension code locally: - -```bash -# Option 1: docker cp with /. suffix (avoids nested dirs) -sudo docker cp ./agents/ten_packages/extension/my_ext/. \ - ten_agent_dev:/app/agents/ten_packages/extension/my_ext/ - -# Option 2: tar with cache exclusion (recommended — avoids -# __pycache__ and .pytest_cache causing import errors) -tar --exclude='__pycache__' --exclude='.pytest_cache' \ - -C ai_agents/agents/ten_packages/extension/my_ext -cf - . | \ - sudo docker exec -i ten_agent_dev tar \ - -C /app/agents/ten_packages/extension/my_ext -xf - - -# Verify symlink exists in the example's tenapp -sudo docker exec ten_agent_dev bash -c \ - "ls -la /app/agents/examples//tenapp/ten_packages/extension/my_ext" - -# If missing, create it manually -sudo docker exec ten_agent_dev bash -c \ - "ln -sf /app/agents/ten_packages/extension/my_ext \ - /app/agents/examples//tenapp/ten_packages/extension/my_ext" -``` - -Then do a full restart. - -**Common pitfall:** If `docker cp` copies `__pycache__` or `.pytest_cache` -from your local machine into the container, it can cause `ImportError` or -stale bytecode during test collection. Use the tar method above or clean -the container directory before copying: - -```bash -sudo docker exec ten_agent_dev bash -c \ - "find /app/agents/ten_packages/extension/my_ext \ - -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null; \ - find /app/agents/ten_packages/extension/my_ext \ - -type d -name .pytest_cache -exec rm -rf {} + 2>/dev/null" -``` - -## After Container Restart Checklist - -1. Reinstall Python dependencies -2. Rebuild Go binary (`task install`) -3. Kill any zombie workers -4. 
Remove stale `.next/dev/lock` -5. Start with `task run` -6. Verify health endpoint and frontend status code - -## See Also - -- [Back to Gotchas](../07_gotchas.md) -- [Back to Workflows](../05_workflows.md) -- [Deployment](deployment.md) — Production setup, Cloudflare, Nginx -- [Server Architecture](server_architecture.md) — Worker lifecycle diff --git a/docs/ai/L1/deep_dives/server_architecture.md b/docs/ai/L1/deep_dives/server_architecture.md deleted file mode 100644 index 18bebf0a15..0000000000 --- a/docs/ai/L1/deep_dives/server_architecture.md +++ /dev/null @@ -1,211 +0,0 @@ -# Server Architecture - -> **When to Read This:** Load this document when you need to understand how the Go API -> server works, how property injection transforms graph configurations at runtime, or -> how worker processes are managed. - -## Overview - -The TEN Agent server is a Go HTTP server built with the Gin framework. It manages -agent session lifecycles — starting worker processes, injecting configuration, and -handling session keepalive/teardown. - -## Server Structure - -``` -server/ -├── main.go # Entry point, parses flags, starts HTTP server -└── internal/ - ├── http_server.go # All endpoint handlers + property injection - └── config.go # startPropMap configuration for parameter injection -``` - -Key launch flag: `-tenapp_dir=` — points to the example's `tenapp/` directory -containing `property.json` and `manifest.json`. 
- -## Endpoint Handlers - -| Handler | Route | Purpose | -| -------------------------------- | ------------------ | ----------------------------------- | -| `handlerHealth()` | `GET /health` | Returns `{"code":"0"}` if running | -| `handleGraphs()` | `GET /graphs` | Reads predefined_graphs from property.json | -| `handlerStart()` | `POST /start` | Spawns worker process for a session | -| `handlerStop()` | `POST /stop` | Terminates worker process | -| `handlerPing()` | `POST /ping` | Resets session timeout timer | -| `handlerList()` | `GET /list` | Lists active workers/channels | -| `handlerGenerateToken()` | `POST /token/generate` | Generates Agora RTC tokens | -| `handleAddonDefaultProperties()` | `GET /addon/default-properties` | Extension property.json files | -| `handlerVectorDocumentUpdate()` | `POST /vector/document/update` | Vector DB updates | -| `handlerVectorDocumentUpload()` | `POST /vector/document/upload` | File uploads for vector DB | - -## Property Injection Pipeline - -When `/start` is called, the server transforms the static `property.json` into a -session-specific configuration. 
This is the core of the `processProperty` function: - -### Step 1: Read Base Configuration - -```go -// Read property.json from the configured tenapp_dir -propertyJsonFile := filepath.Join(s.config.TenappDir, "property.json") -content, _ := os.ReadFile(propertyJsonFile) -``` - -### Step 2: Filter Graphs - -Only the requested graph is kept; its `auto_start` is set to `true`: - -```go -// Find matching graph by name -for _, graph := range predefinedGraphs { - if graph.Name == req.GraphName { - graph.AutoStart = true - filteredGraphs = append(filteredGraphs, graph) - } -} -``` - -### Step 3: Merge Dynamic Properties - -Per-extension property overrides from the request are merged: - -```go -// req.Properties = {"openai_llm2_python": {"model": "gpt-4o-mini"}} -for _, node := range graph.Nodes { - if props, ok := req.Properties[node.Name]; ok { - mergeProperties(node.Property, props) - } -} -``` - -### Step 4: Inject Start Parameters - -The `startPropMap` (defined in `config.go`) maps request fields to node properties: - -```go -var startPropMap = map[string]string{ - "RemoteStreamId": "remote_stream_id", - "BotStreamId": "agora_uid", - "Token": "token", - "WorkerHttpServerPort": "server_port", -} -``` - -These values are injected into every node that has the corresponding property defined. - -### Step 5: Channel Auto-Injection - -Any node with a `"channel"` property automatically receives the request's `channel_name`: - -```go -// Scan all nodes — if node has "channel" property, inject channel_name -for _, node := range graph.Nodes { - if _, hasChannel := node.Property["channel"]; hasChannel { - node.Property["channel"] = req.ChannelName - } -} -``` - -This is future-proof: adding a new extension with a `"channel"` property requires -zero server code changes. - -### Step 6: Environment Variable Resolution - -All `${env:VAR}` and `${env:VAR|default}` references in the property JSON are -resolved against the container's environment. 
- -### Step 7: Write Temp File and Spawn Worker - -The modified property JSON is written to a temporary file, and a worker process -is spawned: - -```go -// Write modified config -tmpFile := filepath.Join(tmpDir, "property.json") -os.WriteFile(tmpFile, modifiedJSON, 0644) - -// Spawn worker -cmd := exec.Command("tman", "run", "start", "--property", tmpFile) -``` - -## Worker Process Lifecycle - -``` -/start request - │ - ▼ -Server: processProperty() → temp property.json - │ - ▼ -Server: exec("tman run start --property ") - │ - ▼ -Worker process starts → loads graph → initializes extensions - │ - ├── Extensions call on_init() → on_start() - ├── Extensions process messages (cmd, data, audio_frame, video_frame) - │ - ├── /ping requests reset the timeout timer - │ - ▼ -/stop request OR timeout - │ - ▼ -Worker: extensions call on_stop() → on_deinit() - │ - ▼ -Worker process terminates -``` - -**Important**: Worker processes run on the **host machine**, not inside Docker. -They can outlive the server process and even container restarts. Always check for -zombie workers with `ps -elf | grep 'bin/main'`. - -## Session Management - -| Action | Server Behavior | -| -------------- | -------------------------------------------------- | -| `/start` | Spawns worker, stores in active workers map | -| `/stop` | Sends SIGTERM to worker, removes from map | -| `/ping` | Resets timeout timer for the channel | -| Timeout | Auto-sends SIGTERM after `timeout` seconds idle | -| `/list` | Returns all active channel → worker mappings | - -Timeout of `-1` means the session never auto-stops (requires explicit `/stop`). - -## LOG_STDOUT for Worker Output - -Worker processes write to stdout. To see their output in `/tmp/task_run.log`, -the `.env` must have: - -```bash -LOG_STDOUT=true -``` - -Without this, extension logs (Python `print()`, `ten_env.log_*()`) are invisible. 
- -## Security Measures - -- **Path traversal prevention**: The server ignores any client-provided `tenapp_dir` - and always uses the launch-configured path -- **Channel name sanitization**: Channel names are validated before use in file paths -- **Safe property merge**: `mergeProperties()` handles nested configs safely with - type checking - -## Configuration (config.go) - -The `startPropMap` in `config.go` controls which request fields map to which -node properties: - -| Request Field | Node Property | Purpose | -| ---------------------- | -------------------- | ------------------------------ | -| `RemoteStreamId` | `remote_stream_id` | Remote user's stream ID | -| `BotStreamId` | `agora_uid` | Bot's Agora UID | -| `Token` | `token` | Agora RTC token | -| `WorkerHttpServerPort` | `server_port` | Worker's HTTP server port | - -## See Also - -- [Back to Architecture](../02_architecture.md) -- [Graph Configuration](graph_configuration.md) — Property.json structure and connections -- [Back to Interfaces](../06_interfaces.md) diff --git a/docs/ai/L1/deep_dives/testing.md b/docs/ai/L1/deep_dives/testing.md deleted file mode 100644 index 7c77e5ce0d..0000000000 --- a/docs/ai/L1/deep_dives/testing.md +++ /dev/null @@ -1,305 +0,0 @@ -# Testing - -> **When to Read This:** Load this document when you need to run tests for an extension, -> understand what the guarder tests validate, or debug test failures. - -## Overview - -Three levels of testing: -1. **Extension standalone tests** — per-extension unit/integration tests in `tests/` -2. **Guarder integration tests** — framework-level ASR/TTS validation suites -3. 
**Root-level tasks** — orchestrated via `Taskfile.yml` - -## Running Tests - -```bash -# All tests -docker exec ten_agent_dev bash -c "cd /app && task test" - -# Single extension with dependency install -docker exec ten_agent_dev bash -c \ - "cd /app && task test-extension EXTENSION=agents/ten_packages/extension/deepgram_tts" - -# Single extension, skip install (faster iteration) -docker exec ten_agent_dev bash -c \ - "cd /app && task test-extension-no-install EXTENSION=agents/ten_packages/extension/deepgram_tts" - -# TTS guarder (16 tests) -docker exec ten_agent_dev bash -c "cd /app && task tts-guarder-test EXTENSION=deepgram_tts" - -# ASR guarder (10 tests) -docker exec ten_agent_dev bash -c "cd /app && task asr-guarder-test EXTENSION=azure_asr_python" - -# Specific test only (faster iteration on failures) -docker exec ten_agent_dev bash -c "cd /app && task tts-guarder-test EXTENSION=deepgram_tts -- -k test_flush" -``` - -**Before running tests**, sync your local code into the container. Use tar -to exclude cache artifacts that cause import errors: - -```bash -tar --exclude='__pycache__' --exclude='.pytest_cache' \ - -C ai_agents/agents/ten_packages/extension/my_ext -cf - . 
| \ - sudo docker exec -i ten_agent_dev tar \ - -C /app/agents/ten_packages/extension/my_ext -xf - -``` - -## Extension Standalone Tests - -Each extension can have `tests/` with a `bin/start` entry point: - -``` -my_extension/tests/ -├── bin/start # Sets PYTHONPATH, runs pytest -├── configs/ # Test config JSON files -│ ├── property.json -│ ├── property_basic_audio_setting1.json -│ ├── property_basic_audio_setting2.json -│ ├── property_dump.json -│ ├── property_miss_required.json -│ └── property_invalid.json -├── conftest.py # Fixtures -└── test_*.py # Test files -``` - -### PYTHONPATH - -Tests need this to import TEN runtime: - -```bash -export PYTHONPATH=".:ten_packages/system/ten_runtime_python/lib:\ -ten_packages/system/ten_runtime_python/interface:\ -ten_packages/system/ten_ai_base/interface:\ -ten_packages/extension/${EXT_NAME}:$PYTHONPATH" -``` - ---- - -## TTS Guarder Tests (15 Tests) - -**Location**: `agents/integration_tests/tts_guarder/` - -These tests run against any TTS extension. The manifest template (`manifest-tmpl.json`) -substitutes `{{extension_name}}` with your extension name at runtime. 
- -### Test Inventory - -| # | Test | What It Validates | Pass Criteria | -|---|------|-------------------|---------------| -| 1 | `test_append_input` | Multiple texts appended with same request_id | audio_start -> frames -> audio_end per group, correct request_id | -| 2 | `test_append_input_stress` | High volume append operations | All appends processed without errors | -| 3 | `test_append_input_without_text_input_end` | Missing text_input_end flag | Processes correctly despite missing flags | -| 4 | `test_append_interrupt` | New requests interrupting in-progress ones | Interrupts handled without crash or malformed audio | -| 5 | `test_basic_audio_setting` | Different sample rates produce different audio | Two configs with different sample_rate yield different output rates | -| 6 | `test_corner_input` | Special chars, emojis, punctuation-only, very short/long | All processed without errors | -| 7 | `test_dump` | Audio dump file creation | Dump file exists, contains valid PCM, size matches duration | -| 8 | `test_dump_each_request_id` | Separate dump files per request_id | Each request_id has own dump file | -| 9 | `test_empty_text_request` | Empty/whitespace text | audio_end within 500ms, no audio data, no crash | -| 10 | `test_flush` | Flush signal handling | Receives flush_end with matching flush_id, no data for 5s after | -| 11 | `test_interleaved_requests` | 8 concurrent requests with different request_ids | Each maintains separate audio stream, correct ordering per request | -| 12 | `test_invalid_required_params` | Invalid API key | Returns FATAL ERROR with message, no crash | -| 13 | `test_invalid_text_handling` | Malformed text, null chars, very long strings | Handled gracefully without crash | -| 14 | `test_metrics` | TTFB metric generation | Metrics data present with valid timestamps | -| 15 | `test_miss_required_params` | Missing API key | Appropriate error returned | - -### Critical TTS Invariants - -1. 
**Event ordering must be**: `tts_audio_start` -> `pcm_frame`(s) -> `tts_audio_end` per request -2. **Request isolation**: Interleaved requests must never mix audio streams -3. **Error handling**: Invalid/missing configs produce errors, never crashes -4. **Empty text**: Must complete fast (audio_end within 500ms), generate no audio -5. **Flush**: After flush_end, zero data output for 5 seconds - -### Required TTS Config Files - -Your `tests/configs/` must provide: - -``` -property.json # Valid API key + default settings -property_basic_audio_setting1.json # sample_rate: 16000 + valid key + dump:true -property_basic_audio_setting2.json # sample_rate: 24000 + valid key + dump:true -property_dump.json # dump:true + dump_path + valid key -property_miss_required.json # Empty/missing API key -property_invalid.json # Empty/invalid API key -``` - -**Template** (`property_basic_audio_setting1.json`): -```json -{ - "dump": true, - "dump_path": "./tests/keep_dump_output/", - "params": { - "sample_rate": 16000, - "key": "${env:MY_VENDOR_API_KEY}" - } -} -``` - -### Sample Rate Test Notes - -Some extensions don't support multiple sample rates. To skip the sample rate -comparison (test still runs, just doesn't assert rates differ), the test runner -checks `ENABLE_SAMPLE_RATE` env var. Extensions like `openai_tts_python` and -`humeai_tts_python` set this to `False`. 
- ---- - -## ASR Guarder Tests (10 Tests, 1 Skipped) - -**Location**: `agents/integration_tests/asr_guarder/` - -### Test Audio Format - -- 16-bit PCM, 16kHz sample rate, mono -- Test files: `test_data/16k_en_us.pcm` (English), `test_data/16k_zh_cn.pcm` (Chinese) -- Chunk size: 320 bytes per frame -- Send interval: 10ms between frames - -### Test Inventory - -| # | Test | What It Validates | Pass Criteria | -|---|------|-------------------|---------------| -| 1 | `test_connection_timing` | Connect + transcribe English audio | Results received, language="en-US" | -| 2 | `test_asr_result` | Result structure and data integrity | Fields: id, text, language, session_id all present | -| 3 | `test_asr_finalize` | Finalize signal → final result + finalize_end | final=True in result, finalize_end received | -| 4 | `test_reconnection` | Recovery after connection failure | Error detected, no crash, can reconnect | -| 5 | `test_vendor_error` | Invalid creds → proper error format | Error has id, module, code, message + vendor info | -| 6 | `test_multi_language` | English + Chinese transcription | en-US and zh-CN both detected correctly | -| 7 | `test_dump` | Audio dump functionality | Dump files created with correct data | -| 8 | `test_metrics` | TTFW and TTLW metrics | TTFW > 0, TTLW > TTFW, both in milliseconds | -| 9 | `test_audio_timestamp` | start_ms and duration_ms accuracy | Timestamps accurate within tolerance | -| 10 | `test_long_duration_stream` | **SKIPPED** — 5+ min stream | No timeout or connection drop | - -### Critical ASR Invariants - -1. **Result fields**: Every result must have `id`, `text`, `language`, `session_id` -2. **Finalize flow**: `asr_finalize` cmd -> `final=True` result -> `asr_finalize_end` response -3. **Error format**: `{id, module, code, message, vendor_info: {vendor, code, message}}` -4. 
**Metrics**: TTFW (Time To First Word) > 0, TTLW (Time To Last Word) > TTFW - -### Required ASR Config Files - -``` -property_en.json # Valid key + language: "en-US" -property_zh.json # Valid key + language: "zh-CN" -property_invalid.json # key: "invalid" (triggers vendor error test) -property_dump.json # Valid key + dump: true -``` - -**Template** (`property_en.json` for Deepgram): -```json -{ - "params": { - "key": "${env:DEEPGRAM_API_KEY}", - "model": "nova-2", - "sample_rate": 16000, - "encoding": "linear16", - "language": "en-US" - } -} -``` - ---- - -## Guarder Test Framework Internals - -### Manifest Template System - -Both guarders use template manifests with `{{extension_name}}` placeholders: - -```json -{ - "type": "app", - "name": "tts_guarder", - "version": "0.1.0", - "dependencies": [ - {"path": "../../ten_packages/extension/{{extension_name}}"} - ] -} -``` - -The Taskfile substitutes this at runtime with `sed`. - -### conftest.py Pattern - -Both guarders use a session-scoped FakeApp: - -```python -@pytest.fixture(scope="session", autouse=True) -def global_setup_and_teardown(): - event = threading.Event() - fake_app_ctx = FakeAppCtx(event) - fake_app_thread = threading.Thread(target=run_fake_app, args=(fake_app_ctx,)) - fake_app_thread.start() - event.wait() - yield - fake_app_ctx.fake_app.close() - fake_app_thread.join() -``` - -Each test creates its own `ExtensionTester` within this shared app context. -Tests share the session-scoped app but get fresh extension instances. 
- -### Pytest Options - -- `--extension_name` — extension to test (required) -- `--config_dir` — path to configs directory (required) -- `--enable_sample_rate` — "True"/"False" for sample rate comparison (TTS only) - ---- - -## Common Test Failures and Fixes - -### "Timeout waiting for audio" -- **Cause**: External API not responding within timeout -- **Fix**: Check API key is valid, check network, increase timeout if needed -- **Note**: Some flakiness is expected with external APIs — run individually to confirm - -### "Received error data" / FATAL ERROR -- **Cause**: Extension detected invalid config and raised error (this is correct behavior for error tests) -- **Fix**: If this happens on non-error tests, check your config files have valid API keys - -### "Found N dump files, expected M" -- **Cause**: Some requests timed out and didn't produce dump files -- **Fix**: Usually API timeout flakiness — rerun the test - -### "Received additional data after flush_end" -- **Cause**: Extension sent audio data after it should have stopped -- **Fix**: Ensure your cancel_tts/flush handling stops all pending output immediately - -### "Test failed: sample rates are the same" -- **Cause**: Your extension ignores the sample_rate config -- **Fix**: Implement sample_rate support, or set ENABLE_SAMPLE_RATE=False if your API doesn't support it - -### Import errors -- **Cause**: PYTHONPATH doesn't include ten_runtime_python and ten_ai_base -- **Fix**: Check `tests/bin/start` script sets PYTHONPATH correctly - -### "ModuleNotFoundError: No module named 'ten_packages.extension.xxx'" -- **Cause**: Extension not installed in test environment -- **Fix**: Run `tman install --standalone` in extension directory, or use `task test-extension` (does it automatically) - ---- - -## CI/CD Pipeline - -### Manual Guarder Tests (GitHub Actions) - -ASR and TTS guarder tests can be triggered manually: - -- Workflow: `.github/workflows/manual_test_asr_guarder.yml` -- Inputs: `extension` name, 
`config_dir`, `branch`, `env_vars` (semicolon-separated secret names) -- API keys loaded from GitHub Secrets at runtime - -### Extension Publishing - -- Workflow: `.github/workflows/manual_publish_extension.yml` -- Steps: `tman install --standalone` -> `tman run build` -> `tman publish` -- Requires `TEN_CLOUD_STORE` secret for publishing - ---- - -## See Also - -- [Extension Development](extension_development.md) — Config files and pre-submission checklist -- [Back to Workflows](../05_workflows.md) From d66c783e1669e084a4952599ca9fe0d4307b5c44 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 16:23:56 +0000 Subject: [PATCH 15/18] fix: move cancel flag reset to just before ws.send clear _is_cancelled just before sending Speak+Flush, not at method entry. prevents a concurrent cancel() from being lost if it races with get() starting up. --- .../ten_packages/extension/deepgram_tts/deepgram_tts.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index 060e726b3c..cf82a000f0 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -138,9 +138,7 @@ async def get( yield None, EVENT_TTS_END return - self._is_cancelled = False - - # Reconnect if needed (new request_id or after error) + # Reconnect if needed (after error or cancel) if self._needs_reconnect: await self._reconnect() self._needs_reconnect = False @@ -150,6 +148,10 @@ async def get( if not self._ttfb_sent: self._sent_ts = datetime.now() + # Clear cancel flag just before sending, not at + # method entry — avoids race with concurrent cancel() + self._is_cancelled = False + # Send Speak + Flush speak_msg = {"type": "Speak", "text": text} await self._ws.send(json.dumps(speak_msg)) From 76c781d129768b7a5627b870032a0eb539065694 Mon Sep 17 
00:00:00 2001 From: Ubuntu Date: Tue, 7 Apr 2026 16:29:34 +0000 Subject: [PATCH 16/18] fix: remove dual finalization path, dead config code, simplify - remove duplicate _finalize_request on empty EVENT_TTS_RESPONSE. rely solely on EVENT_TTS_END to close requests, avoiding risk of double-finalization. - remove dead to_str() branch checking params['api_key'] after update_params() already deletes it. - simplify _ensure_dict to only handle dict and fallback to empty. --- .../agents/ten_packages/extension/deepgram_tts/config.py | 7 +------ .../ten_packages/extension/deepgram_tts/extension.py | 9 +-------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/config.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/config.py index 901b2eb449..cff5587242 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/config.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/config.py @@ -58,11 +58,8 @@ def to_str(self, sensitive_handling: bool = True) -> str: config = copy.deepcopy(self) - # Encrypt sensitive fields if config.api_key: config.api_key = utils.encrypt(config.api_key) - if config.params and "api_key" in config.params: - config.params["api_key"] = utils.encrypt(config.params["api_key"]) return f"{config}" @@ -70,6 +67,4 @@ def to_str(self, sensitive_handling: bool = True) -> str: def _ensure_dict(value: Any) -> dict[str, Any]: if isinstance(value, dict): return value - if value is None: - return {} - return dict(value) + return {} diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index 20129d6f9e..85190a3de7 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -298,14 +298,7 @@ async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: await self._write_dump(data_msg) 
await self.send_tts_audio_data(data_msg) else: - self.ten_env.log_debug( - "Received empty payload for TTS response" - ) - if t.text_input_end: - await self._finalize_request( - TTSAudioEndReason.REQUEST_END - ) - break + self.ten_env.log_debug("Empty payload, ignoring") elif event_status == EVENT_TTS_TTFB_METRIC: if data_msg is not None and isinstance(data_msg, int): From 09265e3b50fd14cb44a1fc0c09f8992df2feb997 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 8 Apr 2026 10:56:31 +0000 Subject: [PATCH 17/18] feat: add vendor params passthrough to deepgram websocket URL forward additional deepgram query parameters from config.params through to the websocket connection string. known keys (api_key, base_url, model, encoding, sample_rate) are normalized onto the config object; remaining scalar keys are appended to the query string via urlencode. - replace f-string URL building with urlencode for correctness - improve TTS_END logging to distinguish final vs intermediate events - add test_params_passthrough unit test for URL construction - bump version to 0.1.1 --- .../extension/deepgram_tts/README.md | 8 +++- .../extension/deepgram_tts/deepgram_tts.py | 21 ++++++++--- .../extension/deepgram_tts/extension.py | 12 ++++-- .../extension/deepgram_tts/manifest.json | 2 +- .../deepgram_tts/tests/test_params.py | 37 ++++++++++++++++++- 5 files changed, 67 insertions(+), 13 deletions(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/README.md b/ai_agents/agents/ten_packages/extension/deepgram_tts/README.md index c8be961b39..ab18a5b30b 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/README.md +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/README.md @@ -22,6 +22,7 @@ A TEN Framework extension that provides Text-to-Speech (TTS) capabilities using | `params.encoding` | string | `linear16` | Audio encoding format | | `params.sample_rate` | int | `24000` | Output sample rate in Hz | | `params.base_url` | string | 
`wss://api.deepgram.com/v1/speak` | WebSocket endpoint | +| `params.` | scalar | Optional | Additional Deepgram websocket query parameters passed through to the vendor | | `dump` | bool | `false` | Enable audio dumping | | `dump_path` | string | `/tmp` | Path for audio dump files | @@ -33,13 +34,18 @@ A TEN Framework extension that provides Text-to-Speech (TTS) capabilities using "api_key": "${env:DEEPGRAM_API_KEY}", "model": "aura-2-thalia-en", "encoding": "linear16", - "sample_rate": 24000 + "sample_rate": 24000, + "container": "none" }, "dump": false, "dump_path": "/tmp" } ``` +Known extension-owned keys such as `api_key`, `base_url`, `model`, `encoding`, +and `sample_rate` are normalized onto the config object. Any remaining scalar +keys under `params` are appended to the Deepgram websocket query string. + ## Available Voice Models Deepgram Aura-2 voices: diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index cf82a000f0..929e60fa1a 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -7,6 +7,7 @@ import json from datetime import datetime from typing import AsyncIterator +from urllib.parse import urlencode import websockets from websockets.asyncio.client import ClientConnection @@ -65,12 +66,20 @@ def __init__( def _build_ws_url(self) -> str: base = self.config.base_url - params = ( - f"model={self.config.model}" - f"&encoding={self.config.encoding}" - f"&sample_rate={self.config.sample_rate}" - ) - return f"{base}?{params}" + query_params: dict[str, str | int | float | bool] = { + "model": self.config.model, + "encoding": self.config.encoding, + "sample_rate": self.config.sample_rate, + } + + # Forward any additional Deepgram vendor params through the websocket + # query string while keeping auth and endpoint configuration out of it. 
+ for key, value in self.config.params.items(): + if key in {"api_key", "base_url"} or value is None: + continue + query_params[key] = value + + return f"{base}?{urlencode(query_params, doseq=True)}" async def start(self) -> None: """Preheat: establish initial connection.""" diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index 85190a3de7..0f37ebad70 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -320,11 +320,17 @@ async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: ) elif event_status == EVENT_TTS_END: - self.ten_env.log_info( - "Received TTS_END event from Deepgram TTS" - ) if t.text_input_end: + self.ten_env.log_info( + f"Received final TTS_END event from Deepgram TTS " + f"for request_id: {t.request_id}" + ) await self._finalize_request(TTSAudioEndReason.REQUEST_END) + else: + self.ten_env.log_debug( + f"Received intermediate TTS_END event from " + f"Deepgram TTS for request_id: {t.request_id}" + ) break elif event_status == EVENT_TTS_ERROR: diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/manifest.json b/ai_agents/agents/ten_packages/extension/deepgram_tts/manifest.json index c2ef9bb7a0..ffaceacaa3 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/manifest.json +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/manifest.json @@ -1,7 +1,7 @@ { "type": "extension", "name": "deepgram_tts", - "version": "0.1.0", + "version": "0.1.1", "dependencies": [ { "type": "system", diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py index aded961fde..48ed8fe1b6 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py +++ 
b/ai_agents/agents/ten_packages/extension/deepgram_tts/tests/test_params.py @@ -4,7 +4,8 @@ # See the LICENSE file for more information. # import json -from unittest.mock import patch, AsyncMock +from urllib.parse import parse_qs, urlparse +from unittest.mock import patch, AsyncMock, MagicMock from ten_runtime import ( @@ -18,7 +19,8 @@ EVENT_TTS_END, EVENT_TTS_TTFB_METRIC, ) -from unittest.mock import MagicMock +from deepgram_tts.config import DeepgramTTSConfig +from deepgram_tts.deepgram_tts import DeepgramTTSClient def create_mock_client(): @@ -38,6 +40,37 @@ async def mock_get(text): return mock +def test_params_passthrough(): + """Additional Deepgram params should be appended to the websocket URL.""" + config = DeepgramTTSConfig( + params={ + "api_key": "test_api_key", + "base_url": "wss://api.deepgram.com/v1/speak", + "model": "aura-2-thalia-en", + "encoding": "linear16", + "sample_rate": 24000, + "bit_rate": 64000, + "container": "none", + } + ) + config.update_params() + + client = DeepgramTTSClient(config=config, ten_env=MagicMock()) + parsed = urlparse(client._ws_url) + query = parse_qs(parsed.query) + + assert parsed.scheme == "wss" + assert parsed.netloc == "api.deepgram.com" + assert parsed.path == "/v1/speak" + assert query["model"] == ["aura-2-thalia-en"] + assert query["encoding"] == ["linear16"] + assert query["sample_rate"] == ["24000"] + assert query["bit_rate"] == ["64000"] + assert query["container"] == ["none"] + assert "api_key" not in query + assert "base_url" not in query + + # ================ test different sample rates ================ class ExtensionTesterSampleRate(ExtensionTester): def __init__(self, sample_rate: int): From 13b76a1ef68a65a87a0c0e3b8f06c8f54c59b450 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 8 Apr 2026 12:16:40 +0000 Subject: [PATCH 18/18] fix: add clarifying comments for event constant gap and sent_ts overwrite --- .../ten_packages/extension/deepgram_tts/deepgram_tts.py | 3 ++- 
.../agents/ten_packages/extension/deepgram_tts/extension.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py index 929e60fa1a..57c69132f3 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/deepgram_tts.py @@ -17,7 +17,8 @@ from ten_runtime import AsyncTenEnv from ten_ai_base.const import LOG_CATEGORY_VENDOR -# Event types communicated back to the extension +# Event types communicated back to the extension. +# 4 is reserved (used by other TTS extensions for flush events). EVENT_TTS_RESPONSE = 1 EVENT_TTS_END = 2 EVENT_TTS_ERROR = 3 diff --git a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py index 0f37ebad70..aee0b9f1e8 100644 --- a/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py +++ b/ai_agents/agents/ten_packages/extension/deepgram_tts/extension.py @@ -302,6 +302,10 @@ async def _process_tts_text(self, text: str, t: TTSTextInput) -> None: elif event_status == EVENT_TTS_TTFB_METRIC: if data_msg is not None and isinstance(data_msg, int): + # Overwrite sent_ts to audio-start time so that + # _current_request_interval_ms() measures streaming + # duration (first audio → last audio), not total + # request time. This matches the HTTP base class. self.sent_ts = datetime.now() ttfb = data_msg await self.send_tts_audio_start(