diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input.py index 0c6f867f41..ada9c586a7 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input.py @@ -356,6 +356,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # Calculate PCM duration based on current request audio bytes # Use current_request_audio_bytes which is already updated by audio frames @@ -376,6 +377,22 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: ten_env.log_info( f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms" ) + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error( + ten_env, + f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms", + ) + return + ten_env.log_info( + f"✅ Event interval validation passed. 
Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms" + ) + else: + ten_env.log_info( + f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms" + ) ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") else: @@ -505,4 +522,3 @@ def test_append_input(extension_name: str, config_dir: str) -> None: assert ( error is None ), f"Test failed: {error.error_message() if error else 'Unknown error'}" - diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input_stress.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input_stress.py index 5c4e7c3b1c..181896ef63 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input_stress.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input_stress.py @@ -22,6 +22,7 @@ import random import uuid import string +import asyncio TTS_DUMP_CONFIG_FILE = "property_dump.json" AUDIO_DURATION_TOLERANCE_MS = 50 @@ -137,7 +138,7 @@ async def on_start(self, ten_env: AsyncTenEnvTester) -> None: ten_env.log_info(f"Sending request {request_idx + 1}/{self.expected_group_count}: {text} (request_id: {request_id})") # Each request has only one text, so text_input_end is always True - time.sleep(5) + await asyncio.sleep(5) await self._send_tts_text_input( ten_env, text, request_id, metadata, text_input_end=True ) @@ -351,6 +352,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # Calculate PCM duration based on current request audio bytes # Use current_request_audio_bytes which is already updated by audio frames @@ -371,6 +373,22 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> 
None: ten_env.log_info( f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms" ) + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error( + ten_env, + f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms", + ) + return + ten_env.log_info( + f"✅ Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms" + ) + else: + ten_env.log_info( + f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms" + ) ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") else: @@ -554,4 +572,3 @@ def test_append_input_stress( assert ( error is None ), f"Test failed: {error.error_message() if error else 'Unknown error'}" - diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input_without_text_input_end.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input_without_text_input_end.py index 022d164dbb..ae48331ef3 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input_without_text_input_end.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_input_without_text_input_end.py @@ -426,6 +426,9 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: received_audio_duration_ms, _ = data.get_property_int( "request_total_audio_duration_ms" ) + received_event_interval_ms, _ = data.get_property_int( + "request_event_interval_ms" + ) # Calculate PCM duration based on current request audio bytes # Use current_request_audio_bytes which is already updated by audio frames @@ -451,6 +454,22 @@ async def on_data(self, ten_env: AsyncTenEnvTester, 
data: Data) -> None: f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms" ) + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error( + ten_env, + f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms", + ) + return + ten_env.log_info( + f"✅ Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms" + ) + else: + ten_env.log_info( + f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms" + ) + ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") else: ten_env.log_warn("tts_audio_start not received before tts_audio_end") diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_interrupt.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_interrupt.py index 1eaab8a5b2..898d00b435 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_interrupt.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_append_interrupt.py @@ -501,6 +501,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # Calculate PCM duration based on current request audio bytes # Use current_request_audio_bytes which is already updated by audio frames @@ -521,6 +522,22 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: ten_env.log_info( f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: 
{received_audio_duration_ms}ms" ) + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error( + ten_env, + f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms", + ) + return + ten_env.log_info( + f"✅ Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms" + ) + else: + ten_env.log_info( + f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms" + ) ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") else: @@ -797,4 +814,3 @@ def test_append_interrupt(extension_name: str, config_dir: str) -> None: assert ( error is None ), f"Test failed: {error.error_message() if error else 'Unknown error'}" - diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_basic_audio_setting.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_basic_audio_setting.py index 0fbc40ac8d..c9f2e07d4b 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_basic_audio_setting.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_basic_audio_setting.py @@ -251,6 +251,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms (actual audio duration) received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # Validate audio duration: request_total_audio_duration_ms should be consistent with the length calculated from the PCM file pcm_audio_duration_ms = self._calculate_pcm_audio_duration_ms() @@ -262,6 +263,15 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: 
ten_env.log_info(f"✅ [{self.test_name}] Audio duration validation passed. PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms, Diff: {audio_duration_diff}ms") else: ten_env.log_info(f"[{self.test_name}] Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms") + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error(ten_env, f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + return + ten_env.log_info(f"✅ [{self.test_name}] Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + else: + ten_env.log_info(f"[{self.test_name}] Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms") # Record actual elapsed time (for debugging) ten_env.log_info(f"[{self.test_name}] Actual event duration: {actual_duration_ms:.2f}ms") diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_corner_input.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_corner_input.py index 17955c2a01..eb3b0b195a 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_corner_input.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_corner_input.py @@ -234,6 +234,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms (actual audio duration) received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # Validate audio duration: request_total_audio_duration_ms should be consistent with the length calculated from the PCM file 
pcm_audio_duration_ms = self._calculate_pcm_audio_duration_ms() @@ -245,6 +246,15 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: ten_env.log_info(f"✅ Audio duration validation passed. PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms, Diff: {audio_duration_diff}ms") else: ten_env.log_info(f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms") + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error(ten_env, f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + return + ten_env.log_info(f"✅ Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + else: + ten_env.log_info(f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms") # Record actual elapsed time (for debugging) ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_dump.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_dump.py index 30bcc56774..193fafbe34 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_dump.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_dump.py @@ -235,6 +235,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms (actual audio duration) received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # Validate audio duration: request_total_audio_duration_ms should be consistent with the length 
calculated from the PCM file pcm_audio_duration_ms = self._calculate_pcm_audio_duration_ms() @@ -246,6 +247,15 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: ten_env.log_info(f"✅ Audio duration validation passed. PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms, Diff: {audio_duration_diff}ms") else: ten_env.log_info(f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms") + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error(ten_env, f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + return + ten_env.log_info(f"✅ Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + else: + ten_env.log_info(f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms") # Record actual elapsed time (for debugging) ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_dump_each_request_id.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_dump_each_request_id.py index eb86bb4786..007b8c2824 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_dump_each_request_id.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_dump_each_request_id.py @@ -236,6 +236,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms (actual audio duration) received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # 
Validate audio duration: request_total_audio_duration_ms should be consistent with the length calculated from the PCM file pcm_audio_duration_ms = self._calculate_pcm_audio_duration_ms() @@ -247,6 +248,15 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: ten_env.log_info(f"✅ Audio duration validation passed. PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms, Diff: {audio_duration_diff}ms") else: ten_env.log_info(f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms") + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error(ten_env, f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + return + ten_env.log_info(f"✅ Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + else: + ten_env.log_info(f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms") # Record actual elapsed time (for debugging) ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_interleaved_requests.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_interleaved_requests.py index 6d3962271d..01f6839ec0 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_interleaved_requests.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_interleaved_requests.py @@ -437,6 +437,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + 
received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # Calculate PCM duration based on current request audio bytes # Use current_request_audio_bytes which is already updated by audio frames @@ -457,6 +458,22 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: ten_env.log_info( f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms" ) + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error( + ten_env, + f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms", + ) + return + ten_env.log_info( + f"✅ Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms" + ) + else: + ten_env.log_info( + f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms" + ) ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") else: @@ -596,4 +613,3 @@ def test_interleaved_requests(extension_name: str, config_dir: str) -> None: error is None ), f"Test failed: {error.error_message() if error else 'Unknown error'}" - diff --git a/ai_agents/agents/integration_tests/tts_guarder/tests/test_metrics.py b/ai_agents/agents/integration_tests/tts_guarder/tests/test_metrics.py index a9df0e4de1..ee9f99b6c8 100644 --- a/ai_agents/agents/integration_tests/tts_guarder/tests/test_metrics.py +++ b/ai_agents/agents/integration_tests/tts_guarder/tests/test_metrics.py @@ -234,6 +234,7 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: # Get request_total_audio_duration_ms (actual audio duration) received_audio_duration_ms, _ = data.get_property_int("request_total_audio_duration_ms") + 
received_event_interval_ms, _ = data.get_property_int("request_event_interval_ms") # Validate audio duration: request_total_audio_duration_ms should be consistent with the length calculated from the PCM file pcm_audio_duration_ms = self._calculate_pcm_audio_duration_ms() @@ -245,6 +246,15 @@ async def on_data(self, ten_env: AsyncTenEnvTester, data: Data) -> None: ten_env.log_info(f"✅ Audio duration validation passed. PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms, Diff: {audio_duration_diff}ms") else: ten_env.log_info(f"Skipping audio duration validation - PCM: {pcm_audio_duration_ms}ms, Reported: {received_audio_duration_ms}ms") + + if received_event_interval_ms > 0: + event_interval_diff = abs(received_event_interval_ms - actual_duration_ms) + if event_interval_diff > AUDIO_DURATION_TOLERANCE_MS: + self._stop_test_with_error(ten_env, f"Event interval mismatch. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + return + ten_env.log_info(f"✅ Event interval validation passed. Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms, Diff: {event_interval_diff:.2f}ms") + else: + ten_env.log_info(f"Skipping event interval validation - Actual: {actual_duration_ms:.2f}ms, Reported: {received_event_interval_ms}ms") # Record actual elapsed time (for debugging) ten_env.log_info(f"Actual event duration: {actual_duration_ms:.2f}ms") diff --git a/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/README.md b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/README.md new file mode 100644 index 0000000000..3b36ba6c94 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/README.md @@ -0,0 +1,51 @@ +# OpenAI TTS Proxy Server + +A simple HTTP proxy server that forwards requests from TTS Client to OpenAI API and streams responses back. 
+
+## Features
+
+- **Transparent Proxy**: Forwards all requests and responses between TTS Client and OpenAI API
+- **Streaming Support**: Efficiently streams audio data without buffering
+- **Header Forwarding**: Preserves all request headers, especially Authorization
+- **Connection Pooling**: Uses httpx with connection pooling for better performance
+- **Health Check**: Provides `/health` endpoint for monitoring
+
+## Usage
+
+### Start the Proxy Server
+
+```bash
+cd proxy
+pip install -r requirements.txt
+python proxy_server.py
+```
+
+Or with environment variables:
+
+```bash
+PROXY_HOST=0.0.0.0 PROXY_PORT=8081 OPENAI_BASE_URL=https://api.openai.com/v1 python proxy_server.py
+```
+
+### Environment Variables
+
+- `PROXY_HOST`: Host to bind to (default: `0.0.0.0`)
+- `PROXY_PORT`: Port to listen on (default: `8081`)
+- `OPENAI_BASE_URL`: OpenAI API base URL (default: `https://api.openai.com/v1`)
+- `OPENAI_API_KEY`: OpenAI API key (optional, can be provided via Authorization header)
+
+The proxy will forward requests to OpenAI using the API key from either:
+1. The `Authorization` header in the request (preferred)
+2. The `OPENAI_API_KEY` environment variable (fallback)
+
+## API Endpoints
+
+- `POST /audio/speech`: Proxy endpoint that forwards to OpenAI `/audio/speech`
+- `GET /health`: Health check endpoint
+
+## Docker Usage
+
+When running in a container with TTS Client:
+
+1. Start the proxy server as a background process or separate service
+2. Configure TTS Client to send requests to `http://localhost:8081`
+3.
Both services will run in the same container and communicate via localhost diff --git a/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/__init__.py b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/__init__.py new file mode 100644 index 0000000000..1e2415b0f1 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/__init__.py @@ -0,0 +1,3 @@ +""" +OpenAI TTS Proxy Server Package +""" diff --git a/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/docker-example.sh b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/docker-example.sh new file mode 100644 index 0000000000..3062887cec --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/docker-example.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Example: How to run proxy server in a container with openai_tts + +# Start proxy server in background +echo "Starting OpenAI TTS Proxy on port 8081..." +python /app/proxy/proxy_server.py & +PROXY_PID=$! + +# Wait for proxy to be ready +sleep 2 + +# Check if proxy is running +if ! kill -0 $PROXY_PID 2>/dev/null; then + echo "Error: Proxy server failed to start" + exit 1 +fi + +echo "Proxy server started (PID: $PROXY_PID)" + +# Your main application would start here +# For example, starting a TEN agent that uses openai_tts2_python +# with base_url configured to http://localhost:8081 + +# Wait for proxy process +wait $PROXY_PID diff --git a/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/proxy_server.py b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/proxy_server.py new file mode 100644 index 0000000000..be9af5273a --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/proxy_server.py @@ -0,0 +1,317 @@ +""" +OpenAI TTS Proxy Server + +A simple HTTP proxy server that forwards requests from openai_tts to OpenAI API +and streams responses back. Designed to run independently in the same container. 
+""" + +import os +import logging +import json +from contextlib import asynccontextmanager +from fastapi import FastAPI, Request, Response +from fastapi.responses import StreamingResponse +import httpx + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + +# Create a single httpx client for connection pooling +http_client = None + + +@asynccontextmanager +async def lifespan(_app: FastAPI): + """Lifespan context manager for FastAPI app.""" + # Startup + global http_client # pylint: disable=global-statement + http_client = httpx.AsyncClient( + timeout=httpx.Timeout(timeout=60.0), + limits=httpx.Limits( + max_connections=100, + max_keepalive_connections=20, + keepalive_expiry=600.0, + ), + http2=True, + ) + logger.info("HTTP client initialized") + yield + # Shutdown + if http_client: + await http_client.aclose() + logger.info("HTTP client closed") + logger.info("Proxy server shutdown complete") + + +app = FastAPI(title="OpenAI TTS Proxy", lifespan=lifespan) + +# Configuration +PROXY_HOST = os.getenv("PROXY_HOST", "0.0.0.0") +PROXY_PORT = int(os.getenv("PROXY_PORT", "8081")) +OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1") +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") + + +async def _proxy_to_openai(request: Request): + """ + Internal function to proxy requests to OpenAI /audio/speech endpoint. + + Forwards the request body and headers (especially Authorization) to OpenAI, + then streams the response back to the client. 
+ """ + try: + # Read request body + body = await request.body() + + # Get request headers + headers = dict(request.headers) + + # Log incoming request details + logger.info("=" * 80) + logger.info("INCOMING REQUEST:") + logger.info(f" Method: {request.method}") + logger.info(f" URL: {request.url}") + logger.info(f" Headers: {dict(headers)}") + + # Try to parse and log request body as JSON + try: + body_json = json.loads(body.decode("utf-8")) + # Mask sensitive fields + body_json_log = body_json.copy() + if "api_key" in body_json_log: + body_json_log["api_key"] = "***MASKED***" + logger.info(f" Body (JSON): {json.dumps(body_json_log, indent=2)}") + except Exception: + logger.info(f" Body (raw): {body[:200]}... (first 200 bytes)") + + # Remove headers that shouldn't be forwarded + headers.pop("host", None) + headers.pop("content-length", None) # Let httpx calculate it + headers.pop("connection", None) + headers.pop("upgrade", None) + + # Use OpenAI API key from environment if Authorization header is not present + if "authorization" not in headers and OPENAI_API_KEY: + headers["authorization"] = f"Bearer {OPENAI_API_KEY}" + + # Log Authorization header (masked) + if "authorization" in headers: + auth_header = headers["authorization"] + if len(auth_header) > 20: + masked_auth = ( + auth_header[:10] + "***MASKED***" + auth_header[-10:] + ) + else: + masked_auth = "***MASKED***" + logger.info(f" Authorization: {masked_auth}") + + # Build target URL + target_url = f"{OPENAI_BASE_URL}/audio/speech" + + logger.info(f"FORWARDING TO: {target_url}") + logger.info(f" Forwarding headers: {dict(headers)}") + logger.info("=" * 80) + + # Forward request to OpenAI with streaming + # Note: We need to keep the stream context alive during the entire streaming process + stream_context = http_client.stream( + "POST", + target_url, + headers=headers, + content=body, + ) + response = ( + await stream_context.__aenter__() + ) # pylint: disable=no-member + + try: + # Log response details + 
logger.info("=" * 80) + logger.info("OPENAI RESPONSE:") + logger.info(f" Status Code: {response.status_code}") + logger.info(f" Response Headers: {dict(response.headers)}") + + # Check for errors + if response.status_code != 200: + error_body = await response.aread() + await stream_context.__aexit__( + None, None, None + ) # pylint: disable=no-member + logger.error( + f"OpenAI API error: {response.status_code}, body: {error_body.decode('utf-8', errors='replace')[:200]}" + ) + return Response( + content=error_body, + status_code=response.status_code, + headers={ + key: value + for key, value in response.headers.items() + if key.lower() in ["content-type"] + }, + ) + + # Prepare essential headers + headers_to_exclude = [ + "content-encoding", + "transfer-encoding", + "connection", + "date", + "set-cookie", + "server", + "cf-ray", + "cf-cache-status", + "alt-svc", + "strict-transport-security", + "x-content-type-options", + "access-control-expose-headers", + ] + + essential_headers = {} + for key, value in response.headers.items(): + key_lower = key.lower() + if key_lower not in headers_to_exclude: + if ( + key_lower in ["content-type"] + or key_lower.startswith("openai-") + or key_lower.startswith("x-") + ): + essential_headers[key] = value + + if "content-type" not in essential_headers: + essential_headers["content-type"] = response.headers.get( + "content-type", "audio/pcm" + ) + + # Stream the response back to client + total_bytes = 0 + chunk_count = 0 + + async def generate(): + nonlocal total_bytes, chunk_count + try: + logger.info("Starting to stream response chunks...") + async for raw_chunk in response.aiter_raw(): + if raw_chunk: + chunk_count += 1 + total_bytes += len(raw_chunk) + if chunk_count <= 5 or chunk_count % 10 == 0: + logger.info( + f" Chunk #{chunk_count}: {len(raw_chunk)} bytes (total: {total_bytes} bytes)" + ) + yield raw_chunk + logger.info( + f"Streaming complete: {chunk_count} chunks, {total_bytes} total bytes" + ) + except Exception as e: + 
error_msg = str(e) + error_type = type(e).__name__ + logger.warning( + f"Streaming interrupted: {error_type}: {error_msg}" + ) + logger.warning( + f" Streamed {chunk_count} chunks, {total_bytes} bytes before interruption" + ) + + if ( + "StreamClosed" in error_type + or "stream has been closed" in error_msg.lower() + or "streamclosed" in error_msg.lower() + ): + logger.debug( + "Client disconnected early, stopping stream gracefully" + ) + return + finally: + # Clean up the stream context when done + try: + await stream_context.__aexit__( + None, None, None + ) # pylint: disable=no-member + except Exception: + pass + + logger.info( + f"Returning StreamingResponse with headers: {essential_headers}" + ) + logger.info("=" * 80) + + return StreamingResponse( + generate(), + status_code=response.status_code, + headers=essential_headers, + ) + except Exception: + # Clean up on error + try: + await stream_context.__aexit__( + None, None, None + ) # pylint: disable=no-member + except Exception: + pass + raise + + except httpx.HTTPStatusError as e: + logger.error( + f"HTTP status error while proxying: {e.response.status_code} - {e}" + ) + return Response( + content=f"Proxy error: {str(e)}", + status_code=( + e.response.status_code if hasattr(e, "response") else 500 + ), + ) + except httpx.HTTPError as e: + logger.error(f"HTTP error while proxying: {e}") + return Response( + content=f"Proxy error: {str(e)}", + status_code=500, + ) + except Exception as e: + logger.error(f"Unexpected error while proxying: {e}", exc_info=True) + return Response( + content=f"Proxy error: {str(e)}", + status_code=500, + ) + + +@app.post("/audio/speech") +async def proxy_audio_speech(request: Request): + """ + Proxy POST requests to OpenAI /audio/speech endpoint. + """ + return await _proxy_to_openai(request) + + +@app.post("/") +async def proxy_root(request: Request): + """ + Proxy POST requests to root path, forwarding to OpenAI /audio/speech endpoint. 
+ + This allows clients to use http://localhost:8081 as the base URL + without needing to specify /audio/speech in the path. + """ + return await _proxy_to_openai(request) + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + return {"status": "ok", "service": "openai-tts-proxy"} + + +if __name__ == "__main__": + import uvicorn + + logger.info(f"Starting OpenAI TTS Proxy on {PROXY_HOST}:{PROXY_PORT}") + logger.info(f"OpenAI Base URL: {OPENAI_BASE_URL}") + + uvicorn.run( + app, + host=PROXY_HOST, + port=PROXY_PORT, + log_level="info", + ) diff --git a/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/requirements.txt b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/requirements.txt new file mode 100644 index 0000000000..80c075c39c --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/requirements.txt @@ -0,0 +1,3 @@ +fastapi>=0.104.0 +uvicorn[standard]>=0.24.0 +httpx>=0.25.0 diff --git a/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/start.sh b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/start.sh new file mode 100644 index 0000000000..75fc6fe9d8 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/openai_tts2_python/proxy/start.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Start script for OpenAI TTS Proxy Server + +set -e + +# Get script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Check if virtual environment exists +if [ ! -d "venv" ]; then + echo "Creating virtual environment..." + python3 -m venv venv +fi + +# Activate virtual environment +source venv/bin/activate + +# Install dependencies +echo "Installing dependencies..." +pip install -q -r requirements.txt + +# Start proxy server +echo "Starting OpenAI TTS Proxy Server..." +python proxy_server.py