From 70af36595ae25390881d6dee788e335a3aac2fd1 Mon Sep 17 00:00:00 2001 From: Varun Nuthalapati Date: Sun, 26 Apr 2026 10:58:04 -0700 Subject: [PATCH] docs: add README for google_asr, azure_asr, and stepfun_tts extensions Add configuration documentation for three extensions that were missing README files, covering parameters, environment variables, and usage examples. Addresses #225. --- .../extension/azure_asr_python/README.md | 113 +++++++++++++++++ .../extension/google_asr_python/README.md | 120 ++++++++++++++++++ .../extension/stepfun_tts_python/README.md | 48 +++++++ 3 files changed, 281 insertions(+) create mode 100644 ai_agents/agents/ten_packages/extension/azure_asr_python/README.md create mode 100644 ai_agents/agents/ten_packages/extension/google_asr_python/README.md create mode 100644 ai_agents/agents/ten_packages/extension/stepfun_tts_python/README.md diff --git a/ai_agents/agents/ten_packages/extension/azure_asr_python/README.md b/ai_agents/agents/ten_packages/extension/azure_asr_python/README.md new file mode 100644 index 0000000000..81e85edb34 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/azure_asr_python/README.md @@ -0,0 +1,113 @@ +# Azure Speech-to-Text ASR Extension + +A TEN Framework extension for real-time speech recognition using [Azure Cognitive Services Speech SDK](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-to-text). + +## Configuration + +All parameters are configured through the `params` object in `property.json`. 
+ +### Basic configuration + +```json +{ + "params": { + "key": "${env:AZURE_STT_KEY|}", + "region": "${env:AZURE_STT_REGION|}" + } +} +``` + +### Language and recognition settings + +```json +{ + "params": { + "key": "${env:AZURE_STT_KEY|}", + "region": "${env:AZURE_STT_REGION|}", + "language": "en-US", + "sample_rate": 16000 + } +} +``` + +### Multilingual recognition + +Pass multiple language codes as a comma-separated string: + +```json +{ + "params": { + "key": "${env:AZURE_STT_KEY|}", + "region": "${env:AZURE_STT_REGION|}", + "language": "en-US,zh-CN,ja-JP" + } +} +``` + +### Hotword boosting + +Provide hotwords to improve recognition of domain-specific terms. Append a `|boost` suffix to a hotword to set its boost value; hotwords without a suffix use the default boost: + +```json +{ + "params": { + "key": "${env:AZURE_STT_KEY|}", + "region": "${env:AZURE_STT_REGION|}", + "hotwords": ["TEN Framework|10", "tman|8", "extension"] + } +} +``` + +### Stream finalization mode + +Controls how the extension finalizes an utterance: + +```json +{ + "params": { + "key": "${env:AZURE_STT_KEY|}", + "region": "${env:AZURE_STT_REGION|}", + "finalize_mode": "mute_pkg", + "mute_pkg_duration_ms": 800 + } +} +``` + +| `finalize_mode` | Behavior | +|-----------------|----------| +| `"mute_pkg"` (default) | Finalize after `mute_pkg_duration_ms` of silence in the audio stream. | +| `"disconnect"` | Finalize on stream disconnect. | + +### Audio dump (debugging) + +```json +{ + "params": { + "key": "${env:AZURE_STT_KEY|}", + "region": "${env:AZURE_STT_REGION|}", + "dump": true, + "dump_path": "./dump/" + } +} +``` + +## Configuration Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `key` | string | `""` | Azure Speech service subscription key. | +| `region` | string | `""` | Azure region, e.g. `"eastus"`, `"westeurope"`. | +| `language` | string | `"en-US"` | Primary language code, or comma-separated list for multilingual. 
| +| `sample_rate` | int | `16000` | Audio sample rate in Hz. | +| `finalize_mode` | string | `"mute_pkg"` | Utterance finalization strategy: `"mute_pkg"` or `"disconnect"`. | +| `mute_pkg_duration_ms` | int | `800` | Silence duration (ms) to trigger finalization in `mute_pkg` mode. | +| `hotwords` | list | `[]` | List of hotwords for phrase-list boosting. Use `"word\|boost"` format to set boost value. | +| `dump` | bool | `false` | Dump incoming audio to a PCM file for debugging. | +| `dump_path` | string | `"."` | Directory path for audio dump files. | + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `AZURE_STT_KEY` | Azure Speech service subscription key | +| `AZURE_STT_REGION` | Azure region identifier (e.g. `eastus`) | diff --git a/ai_agents/agents/ten_packages/extension/google_asr_python/README.md b/ai_agents/agents/ten_packages/extension/google_asr_python/README.md new file mode 100644 index 0000000000..59a802c641 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/google_asr_python/README.md @@ -0,0 +1,120 @@ +# Google Cloud Speech-to-Text ASR Extension + +A TEN Framework extension for real-time speech recognition using [Google Cloud Speech-to-Text V2 API](https://cloud.google.com/speech-to-text/v2/docs). + +## Configuration + +All parameters are configured through the `params` object in `property.json`. + +### Basic configuration + +```json +{ + "params": { + "project_id": "${env:GOOGLE_ASR_PROJECT_ID|}", + "language": "en-US", + "model": "long", + "sample_rate": 16000, + "channels": 1, + "encoding": "LINEAR16", + "interim_results": true + } +} +``` + +### Authentication + +Google Cloud authentication is handled via [Application Default Credentials (ADC)](https://cloud.google.com/docs/authentication/application-default-credentials). 
Two options are supported: + +**Option 1 — Credentials file path:** + +```json +{ + "params": { + "project_id": "${env:GOOGLE_ASR_PROJECT_ID|}", + "adc_credentials_path": "${env:GOOGLE_APPLICATION_CREDENTIALS_PATH|}" + } +} +``` + +**Option 2 — Credentials JSON string:** + +```json +{ + "params": { + "project_id": "${env:GOOGLE_ASR_PROJECT_ID|}", + "adc_credentials_string": "${env:GOOGLE_APPLICATION_CREDENTIALS_STRING|}" + } +} +``` + +### Multilingual recognition + +Pass multiple language codes as a comma-separated string: + +```json +{ + "params": { + "language": "en-US,zh-CN,ja-JP", + "model": "long" + } +} +``` + +### Speaker diarization + +```json +{ + "params": { + "language": "en-US", + "model": "long", + "enable_speaker_diarization": true, + "diarization_speaker_count": 2 + } +} +``` + +### Audio dump (debugging) + +```json +{ + "params": { + "language": "en-US", + "dump": true, + "dump_path": "./dump/" + } +} +``` + +## Configuration Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `project_id` | string | `""` | Google Cloud Project ID. Retrieved from ADC if omitted. | +| `location` | string | `"global"` | Google Cloud location for the recognizer. | +| `adc_credentials_path` | string | `""` | Path to ADC credentials JSON file. | +| `adc_credentials_string` | string | `""` | ADC credentials as a JSON string. | +| `language` | string | `"en-US"` | Primary language code, or comma-separated list for multilingual. | +| `model` | string | `"long"` | Recognition model: `"long"`, `"short"`, `"chirp_2"`. | +| `sample_rate` | int | `16000` | Audio sample rate in Hz. | +| `channels` | int | `1` | Number of audio channels. | +| `encoding` | string | `"LINEAR16"` | Audio encoding. Supports `LINEAR16`, `MULAW`, `ALAW`, `FLAC`, `auto`. | +| `interim_results` | bool | `true` | Emit interim (partial) recognition results. | +| `enable_automatic_punctuation` | bool | `true` | Add punctuation to recognition results. 
| +| `enable_word_time_offsets` | bool | `true` | Include word-level timestamps. | +| `enable_speaker_diarization` | bool | `false` | Enable speaker diarization. | +| `diarization_speaker_count` | int | `0` | Number of speakers (0 = auto-detect). | +| `profanity_filter` | bool | `false` | Filter profanity from results. | +| `max_retry_attempts` | int | `3` | Maximum reconnection attempts on failure. | +| `retry_delay` | float | `1.0` | Seconds between reconnection attempts. | +| `stream_max_duration` | int | `270` | Max stream duration in seconds before reconnect. | +| `dump` | bool | `false` | Dump incoming audio to a PCM file for debugging. | +| `dump_path` | string | `"."` | Directory path for audio dump files. | + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `GOOGLE_ASR_PROJECT_ID` | Google Cloud project ID | +| `GOOGLE_APPLICATION_CREDENTIALS_PATH` | Path to the service account credentials JSON file | +| `GOOGLE_APPLICATION_CREDENTIALS_STRING` | Service account credentials as a JSON string | diff --git a/ai_agents/agents/ten_packages/extension/stepfun_tts_python/README.md b/ai_agents/agents/ten_packages/extension/stepfun_tts_python/README.md new file mode 100644 index 0000000000..ccf5a1fd57 --- /dev/null +++ b/ai_agents/agents/ten_packages/extension/stepfun_tts_python/README.md @@ -0,0 +1,48 @@ +# StepFun TTS Extension + +A TEN Framework extension for text-to-speech synthesis using the [StepFun Realtime Audio API](https://platform.stepfun.com/docs/audio/realtimeaudio). + +## Configuration + +All parameters are configured through the `params` object in `property.json`. 
+ +### Basic configuration + +```json +{ + "params": { + "api_key": "${env:STEPFUN_TTS_KEY|}", + "base_url": "wss://api.stepfun.com/v1/realtime/audio", + "model": "step-tts-mini", + "voice_id": "cixingnansheng" + } +} +``` + +### Custom voice + +```json +{ + "params": { + "api_key": "${env:STEPFUN_TTS_KEY|}", + "base_url": "wss://api.stepfun.com/v1/realtime/audio", + "model": "step-tts-mini", + "voice_id": "wenrouzhixin" + } +} +``` + +## Configuration Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `api_key` | string | `""` | StepFun API key. | +| `base_url` | string | `"wss://api.stepfun.com/v1/realtime/audio"` | StepFun realtime audio WebSocket endpoint. | +| `model` | string | `"step-tts-mini"` | TTS model name. | +| `voice_id` | string | `"cixingnansheng"` | Voice identifier for synthesis. | + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `STEPFUN_TTS_KEY` | StepFun API key |