From 70af36595ae25390881d6dee788e335a3aac2fd1 Mon Sep 17 00:00:00 2001
From: Varun Nuthalapati <nuthalapativarun@gmail.com>
Date: Sun, 26 Apr 2026 10:58:04 -0700
Subject: [PATCH] docs: add README for google_asr, azure_asr, and stepfun_tts
 extensions

Add configuration documentation for three extensions that were missing
README files, covering parameters, environment variables, and usage
examples. Addresses #225.
---
 .../extension/azure_asr_python/README.md      | 113 +++++++++++++++++
 .../extension/google_asr_python/README.md     | 120 ++++++++++++++++++
 .../extension/stepfun_tts_python/README.md    |  48 +++++++
 3 files changed, 281 insertions(+)
 create mode 100644 ai_agents/agents/ten_packages/extension/azure_asr_python/README.md
 create mode 100644 ai_agents/agents/ten_packages/extension/google_asr_python/README.md
 create mode 100644 ai_agents/agents/ten_packages/extension/stepfun_tts_python/README.md

diff --git a/ai_agents/agents/ten_packages/extension/azure_asr_python/README.md b/ai_agents/agents/ten_packages/extension/azure_asr_python/README.md
new file mode 100644
index 0000000000..81e85edb34
--- /dev/null
+++ b/ai_agents/agents/ten_packages/extension/azure_asr_python/README.md
@@ -0,0 +1,113 @@
+# Azure Speech-to-Text ASR Extension
+
+A TEN Framework extension for real-time speech recognition using the [Azure Cognitive Services Speech SDK](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-to-text).
+
+## Configuration
+
+All parameters are configured through the `params` object in `property.json`.
+
+### Basic configuration
+
+```json
+{
+    "params": {
+        "key": "${env:AZURE_STT_KEY|}",
+        "region": "${env:AZURE_STT_REGION|}"
+    }
+}
+```
+
+### Language and recognition settings
+
+```json
+{
+    "params": {
+        "key": "${env:AZURE_STT_KEY|}",
+        "region": "${env:AZURE_STT_REGION|}",
+        "language": "en-US",
+        "sample_rate": 16000
+    }
+}
+```
+
+### Multilingual recognition
+
+Pass multiple language codes as a comma-separated string:
+
+```json
+{
+    "params": {
+        "key": "${env:AZURE_STT_KEY|}",
+        "region": "${env:AZURE_STT_REGION|}",
+        "language": "en-US,zh-CN,ja-JP"
+    }
+}
+```
+
+### Hotword boosting
+
+Provide hotwords to improve recognition of domain-specific terms. Append `|<boost>` to a hotword to set its boost value; a hotword without a suffix uses the default boost:
+
+```json
+{
+    "params": {
+        "key": "${env:AZURE_STT_KEY|}",
+        "region": "${env:AZURE_STT_REGION|}",
+        "hotwords": ["TEN Framework|10", "tman|8", "extension"]
+    }
+}
+```
+
+### Stream finalization mode
+
+The `finalize_mode` parameter controls how the extension finalizes an utterance:
+
+```json
+{
+    "params": {
+        "key": "${env:AZURE_STT_KEY|}",
+        "region": "${env:AZURE_STT_REGION|}",
+        "finalize_mode": "mute_pkg",
+        "mute_pkg_duration_ms": 800
+    }
+}
+```
+
+| `finalize_mode` | Behavior |
+|-----------------|----------|
+| `"mute_pkg"` (default) | Finalize after `mute_pkg_duration_ms` of silence in the audio stream. |
+| `"disconnect"` | Finalize on stream disconnect. |
+
+### Audio dump (debugging)
+
+```json
+{
+    "params": {
+        "key": "${env:AZURE_STT_KEY|}",
+        "region": "${env:AZURE_STT_REGION|}",
+        "dump": true,
+        "dump_path": "./dump/"
+    }
+}
+```
+
+## Configuration Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `key` | string | `""` | Azure Speech service subscription key. |
+| `region` | string | `""` | Azure region, e.g. `"eastus"`, `"westeurope"`. |
+| `language` | string | `"en-US"` | Primary language code, or a comma-separated list of codes for multilingual recognition. |
+| `sample_rate` | int | `16000` | Audio sample rate in Hz. |
+| `finalize_mode` | string | `"mute_pkg"` | Utterance finalization strategy: `"mute_pkg"` or `"disconnect"`. |
+| `mute_pkg_duration_ms` | int | `800` | Silence duration (ms) to trigger finalization in `mute_pkg` mode. |
+| `hotwords` | list | `[]` | List of hotwords for phrase-list boosting. Use the `"word\|boost"` format to set a hotword's boost value. |
+| `dump` | bool | `false` | Dump incoming audio to a PCM file for debugging. |
+| `dump_path` | string | `"."` | Directory path for audio dump files. |
+
+## Environment Variables
+
+| Variable | Description |
+|----------|-------------|
+| `AZURE_STT_KEY` | Azure Speech service subscription key |
+| `AZURE_STT_REGION` | Azure region identifier (e.g. `eastus`) |
diff --git a/ai_agents/agents/ten_packages/extension/google_asr_python/README.md b/ai_agents/agents/ten_packages/extension/google_asr_python/README.md
new file mode 100644
index 0000000000..59a802c641
--- /dev/null
+++ b/ai_agents/agents/ten_packages/extension/google_asr_python/README.md
@@ -0,0 +1,120 @@
+# Google Cloud Speech-to-Text ASR Extension
+
+A TEN Framework extension for real-time speech recognition using the [Google Cloud Speech-to-Text V2 API](https://cloud.google.com/speech-to-text/v2/docs).
+
+## Configuration
+
+All parameters are configured through the `params` object in `property.json`.
+
+### Basic configuration
+
+```json
+{
+    "params": {
+        "project_id": "${env:GOOGLE_ASR_PROJECT_ID|}",
+        "language": "en-US",
+        "model": "long",
+        "sample_rate": 16000,
+        "channels": 1,
+        "encoding": "LINEAR16",
+        "interim_results": true
+    }
+}
+```
+
+### Authentication
+
+Google Cloud authentication is handled via [Application Default Credentials (ADC)](https://cloud.google.com/docs/authentication/application-default-credentials). Two options are supported:
+
+**Option 1 — Credentials file path:**
+
+```json
+{
+    "params": {
+        "project_id": "${env:GOOGLE_ASR_PROJECT_ID|}",
+        "adc_credentials_path": "${env:GOOGLE_APPLICATION_CREDENTIALS_PATH|}"
+    }
+}
+```
+
+**Option 2 — Credentials JSON string:**
+
+```json
+{
+    "params": {
+        "project_id": "${env:GOOGLE_ASR_PROJECT_ID|}",
+        "adc_credentials_string": "${env:GOOGLE_APPLICATION_CREDENTIALS_STRING|}"
+    }
+}
+```
+
+### Multilingual recognition
+
+Pass multiple language codes as a comma-separated string:
+
+```json
+{
+    "params": {
+        "language": "en-US,zh-CN,ja-JP",
+        "model": "long"
+    }
+}
+```
+
+### Speaker diarization
+
+```json
+{
+    "params": {
+        "language": "en-US",
+        "model": "long",
+        "enable_speaker_diarization": true,
+        "diarization_speaker_count": 2
+    }
+}
+```
+
+### Audio dump (debugging)
+
+```json
+{
+    "params": {
+        "language": "en-US",
+        "dump": true,
+        "dump_path": "./dump/"
+    }
+}
+```
+
+## Configuration Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `project_id` | string | `""` | Google Cloud project ID. Retrieved from ADC if omitted. |
+| `location` | string | `"global"` | Google Cloud location for the recognizer. |
+| `adc_credentials_path` | string | `""` | Path to ADC credentials JSON file. |
+| `adc_credentials_string` | string | `""` | ADC credentials as a JSON string. |
+| `language` | string | `"en-US"` | Primary language code, or a comma-separated list of codes for multilingual recognition. |
+| `model` | string | `"long"` | Recognition model: `"long"`, `"short"`, `"chirp_2"`. |
+| `sample_rate` | int | `16000` | Audio sample rate in Hz. |
+| `channels` | int | `1` | Number of audio channels. |
+| `encoding` | string | `"LINEAR16"` | Audio encoding. Supports `LINEAR16`, `MULAW`, `ALAW`, `FLAC`, `auto`. |
+| `interim_results` | bool | `true` | Emit interim (partial) recognition results. |
+| `enable_automatic_punctuation` | bool | `true` | Add punctuation to recognition results. |
+| `enable_word_time_offsets` | bool | `true` | Include word-level timestamps. |
+| `enable_speaker_diarization` | bool | `false` | Enable speaker diarization. |
+| `diarization_speaker_count` | int | `0` | Number of speakers (0 = auto-detect). |
+| `profanity_filter` | bool | `false` | Filter profanity from results. |
+| `max_retry_attempts` | int | `3` | Maximum reconnection attempts on failure. |
+| `retry_delay` | float | `1.0` | Seconds between reconnection attempts. |
+| `stream_max_duration` | int | `270` | Maximum stream duration in seconds before the stream is reconnected. |
+| `dump` | bool | `false` | Dump incoming audio to a PCM file for debugging. |
+| `dump_path` | string | `"."` | Directory path for audio dump files. |
+
+## Environment Variables
+
+| Variable | Description |
+|----------|-------------|
+| `GOOGLE_ASR_PROJECT_ID` | Google Cloud project ID |
+| `GOOGLE_APPLICATION_CREDENTIALS_PATH` | Path to the service account credentials JSON file |
+| `GOOGLE_APPLICATION_CREDENTIALS_STRING` | Service account credentials as a JSON string |
diff --git a/ai_agents/agents/ten_packages/extension/stepfun_tts_python/README.md b/ai_agents/agents/ten_packages/extension/stepfun_tts_python/README.md
new file mode 100644
index 0000000000..ccf5a1fd57
--- /dev/null
+++ b/ai_agents/agents/ten_packages/extension/stepfun_tts_python/README.md
@@ -0,0 +1,48 @@
+# StepFun TTS Extension
+
+A TEN Framework extension for text-to-speech synthesis using the [StepFun Realtime Audio API](https://platform.stepfun.com/docs/audio/realtimeaudio).
+
+## Configuration
+
+All parameters are configured through the `params` object in `property.json`.
+
+### Basic configuration
+
+```json
+{
+    "params": {
+        "api_key": "${env:STEPFUN_TTS_KEY|}",
+        "base_url": "wss://api.stepfun.com/v1/realtime/audio",
+        "model": "step-tts-mini",
+        "voice_id": "cixingnansheng"
+    }
+}
+```
+
+### Custom voice and model
+
+```json
+{
+    "params": {
+        "api_key": "${env:STEPFUN_TTS_KEY|}",
+        "base_url": "wss://api.stepfun.com/v1/realtime/audio",
+        "model": "step-tts-mini",
+        "voice_id": "wenrouzhixin"
+    }
+}
+```
+
+## Configuration Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `api_key` | string | `""` | StepFun API key. |
+| `base_url` | string | `"wss://api.stepfun.com/v1/realtime/audio"` | StepFun realtime audio WebSocket endpoint. |
+| `model` | string | `"step-tts-mini"` | TTS model name. |
+| `voice_id` | string | `"cixingnansheng"` | Voice identifier for synthesis. |
+
+## Environment Variables
+
+| Variable | Description |
+|----------|-------------|
+| `STEPFUN_TTS_KEY` | StepFun API key |