From 7a98841a9f2fe838bfd5f9efcf678c83d126a6d3 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 13 Nov 2025 11:06:52 +0100 Subject: [PATCH 1/7] elevenlabs stt support including scribe v2 realtime --- plugins/elevenlabs/README.stt.md | 213 ++++++++++++++ plugins/elevenlabs/src/index.ts | 2 + plugins/elevenlabs/src/models.ts | 47 +++ plugins/elevenlabs/src/stt.ts | 471 +++++++++++++++++++++++++++++++ 4 files changed, 733 insertions(+) create mode 100644 plugins/elevenlabs/README.stt.md create mode 100644 plugins/elevenlabs/src/stt.ts diff --git a/plugins/elevenlabs/README.stt.md b/plugins/elevenlabs/README.stt.md new file mode 100644 index 000000000..095c138f1 --- /dev/null +++ b/plugins/elevenlabs/README.stt.md @@ -0,0 +1,213 @@ +# ElevenLabs STT Plugin for LiveKit Agents + +This plugin provides speech-to-text capabilities using ElevenLabs Scribe API for LiveKit agents. + +## Features + +- **Multiple Model Support**: Choose between Scribe v1, v2, and v2 realtime +- **Streaming & Non-Streaming**: Support for both batch and real-time transcription +- **Multi-Language**: Supports 35+ languages with automatic language detection +- **Audio Event Tagging**: Optional tagging of non-speech audio events (laughter, footsteps, etc.) +- **VAD Configuration**: Customizable voice activity detection for streaming mode + +## Installation + +```bash +pnpm add @livekit/agents-plugin-elevenlabs +``` + +## Supported Models + +### Scribe v1 (`scribe_v1`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Batch transcription of pre-recorded audio +- **Features**: Audio event tagging, language detection + +### Scribe v2 (`scribe_v2`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Improved accuracy for batch transcription +- **Features**: Enhanced model, language detection + +### Scribe v2 Realtime (`scribe_v2_realtime`) +- **Type**: Streaming +- **Method**: WebSocket +- **Use Case**: Real-time conversation transcription +- **Features**: Interim results, VAD-based segmentation, manual commit support + +## Quick Start + +### Non-Streaming (Scribe v1) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; + +const stt = new STT({ + apiKey: process.env.ELEVEN_API_KEY, // or set ELEVEN_API_KEY env var + model: 'scribe_v1', + languageCode: 'en', + tagAudioEvents: true, +}); +``` + +### Streaming (Scribe v2 Realtime) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; +import { SpeechEventType } from '@livekit/agents'; + +const stt = new STT({ + model: 'scribe_v2_realtime', // default + sampleRate: 16000, + languageCode: 'en', + commitStrategy: 'vad', // auto-commit on speech end + vadSilenceThresholdSecs: 1.0, +}); +``` + +## Configuration Options + +### Common Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `apiKey` | `string` | `process.env.ELEVEN_API_KEY` | ElevenLabs API key | +| `baseURL` | `string` | `https://api.elevenlabs.io/v1` | API base URL | +| `model` | `STTModels` | `'scribe_v1'` | Model to use | +| `languageCode` | `string` | `undefined` | Language code (auto-detected if not set) | + +### Non-Streaming Options (v1, v2) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `tagAudioEvents` | `boolean` | `true` | Tag non-speech events like (laughter) | + +### Streaming Options (v2_realtime) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `sampleRate` | `number` | `16000` | Audio sample rate in Hz 
(16000, 22050, or 44100) | +| `numChannels` | `number` | `1` | Number of audio channels | +| `commitStrategy` | `'vad' \| 'manual'` | `'vad'` | How to commit transcripts | +| `vadSilenceThresholdSecs` | `number` | `undefined` | VAD silence threshold (0.3-3.0 seconds) | +| `vadThreshold` | `number` | `undefined` | VAD threshold (0.1-0.9) | +| `minSpeechDurationMs` | `number` | `undefined` | Minimum speech duration (50-2000 ms) | +| `minSilenceDurationMs` | `number` | `undefined` | Minimum silence duration (50-2000 ms) | + +## Supported Languages + +The plugin supports 35+ languages including: + +- **English** (`en`) +- **Spanish** (`es`) +- **French** (`fr`) +- **German** (`de`) +- **Italian** (`it`) +- **Portuguese** (`pt`) +- **Polish** (`pl`) +- **Dutch** (`nl`) +- **Swedish** (`sv`) +- **Finnish** (`fi`) +- **Danish** (`da`) +- **Norwegian** (`no`) +- **Czech** (`cs`) +- **Romanian** (`ro`) +- **Slovak** (`sk`) +- **Ukrainian** (`uk`) +- **Greek** (`el`) +- **Turkish** (`tr`) +- **Russian** (`ru`) +- **Bulgarian** (`bg`) +- **Croatian** (`hr`) +- **Serbian** (`sr`) +- **Hungarian** (`hu`) +- **Lithuanian** (`lt`) +- **Latvian** (`lv`) +- **Estonian** (`et`) +- **Japanese** (`ja`) +- **Chinese** (`zh`) +- **Korean** (`ko`) +- **Hindi** (`hi`) +- **Arabic** (`ar`) +- **Persian** (`fa`) +- **Hebrew** (`he`) +- **Indonesian** (`id`) +- **Malay** (`ms`) +- **Thai** (`th`) +- **Vietnamese** (`vi`) +- **Tamil** (`ta`) +- **Urdu** (`ur`) + +## Advanced Usage + +### Custom VAD Parameters + +Fine-tune voice activity detection for your use case: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + commitStrategy: 'vad', + + // Longer silence before committing (good for thoughtful speakers) + vadSilenceThresholdSecs: 2.0, + + // Higher threshold = more strict about what's considered speech + vadThreshold: 0.7, + + // Ignore very short speech bursts (reduce false positives) + minSpeechDurationMs: 200, + + // Require longer silence to end speech (reduce fragmentation) + minSilenceDurationMs: 500, +}); +``` + +### Multi-Language Support + +Let ElevenLabs auto-detect the language: + +```typescript +const stt = new STT({ + model: 'scribe_v1', + // Don't set languageCode - will auto-detect +}); + +const event = await stt.recognize(audioBuffer); +console.log('Detected language:', event.alternatives[0].language); +console.log('Text:', event.alternatives[0].text); +``` + +Or specify a language: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + languageCode: 'es', // Spanish +}); +``` + +## Model Comparison + +| Feature | Scribe v1 | Scribe v2 | Scribe v2 Realtime | +|---------|-----------|-----------|-------------------| +| **Type** | Non-streaming | Non-streaming | Streaming | +| **Latency** | High (batch) | High (batch) | Low (real-time) | +| **Interim Results** | ❌ | ❌ | ✅ | +| **Audio Event Tagging** | ✅ | ❌ | ❌ | +| **VAD Configuration** | ❌ | ❌ | ✅ | +| **Manual Commit** | ❌ | ❌ | ✅ | +| **Best For** | Batch jobs with event detection | High-accuracy batch | Real-time conversations | + +## Resources + +- [ElevenLabs STT Documentation](https://elevenlabs.io/docs/api-reference/speech-to-text) +- [Scribe v2 Streaming Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming) +- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) + +## License + +Copyright 2025 LiveKit, Inc. + +Licensed under the Apache License, Version 2.0. 
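+
+## Appendix: Consuming Stream Events
+
+The streaming quick start above imports `SpeechEventType` but never shows it in use. Below is a minimal sketch of reading events from the realtime stream; it assumes the `SpeechStream` returned by `stream()` supports `pushFrame()` and async iteration, as in other LiveKit Agents STT plugins, so adapt it to however your agent sources audio frames.
+
+```typescript
+import { SpeechEventType } from '@livekit/agents';
+import { STT } from '@livekit/agents-plugin-elevenlabs';
+
+const stt = new STT({ model: 'scribe_v2_realtime' });
+const stream = stt.stream();
+
+// Feed audio into the stream as frames arrive from your source,
+// e.g. stream.pushFrame(frame) inside a LiveKit track handler.
+
+for await (const event of stream) {
+  if (event.type === SpeechEventType.INTERIM_TRANSCRIPT) {
+    console.log('interim:', event.alternatives?.[0]?.text);
+  } else if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {
+    console.log('final:', event.alternatives?.[0]?.text);
+  }
+}
+```
+
+With `commitStrategy: 'manual'`, a final transcript is only committed when the stream is flushed (the plugin sends a commit message when it sees the flush sentinel), so call `stream.flush()` at the boundaries you want committed; `flush()` is assumed here to be part of the base `SpeechStream` API.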
diff --git a/plugins/elevenlabs/src/index.ts b/plugins/elevenlabs/src/index.ts index 66c4eeff6..29a7ec457 100644 --- a/plugins/elevenlabs/src/index.ts +++ b/plugins/elevenlabs/src/index.ts @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 import { Plugin } from '@livekit/agents'; +export * from './models.js'; +export * from './stt.js'; export * from './tts.js'; class ElevenLabsPlugin extends Plugin { diff --git a/plugins/elevenlabs/src/models.ts b/plugins/elevenlabs/src/models.ts index 59e419145..573844d2f 100644 --- a/plugins/elevenlabs/src/models.ts +++ b/plugins/elevenlabs/src/models.ts @@ -21,3 +21,50 @@ export type TTSEncoding = // | 'mp3_44100_128' // | 'mp3_44100_192' 'pcm_16000' | 'pcm_22050' | 'pcm_44100'; + +export type STTModels = 'scribe_v1' | 'scribe_v2' | 'scribe_v2_realtime'; + +export type STTAudioFormat = 'pcm_16000' | 'pcm_22050' | 'pcm_44100'; + +export type STTCommitStrategy = 'vad' | 'manual'; + +export type STTLanguages = + | 'en' + | 'es' + | 'fr' + | 'de' + | 'it' + | 'pt' + | 'pl' + | 'nl' + | 'sv' + | 'fi' + | 'da' + | 'no' + | 'cs' + | 'ro' + | 'sk' + | 'uk' + | 'el' + | 'tr' + | 'ru' + | 'bg' + | 'hr' + | 'sr' + | 'hu' + | 'lt' + | 'lv' + | 'et' + | 'ja' + | 'zh' + | 'ko' + | 'hi' + | 'ar' + | 'fa' + | 'he' + | 'id' + | 'ms' + | 'th' + | 'vi' + | 'ta' + | 'ur'; diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts new file mode 100644 index 000000000..6aedae0aa --- /dev/null +++ b/plugins/elevenlabs/src/stt.ts @@ -0,0 +1,471 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + APIConnectionError, + APIStatusError, + APITimeoutError, + type AudioBuffer, + AudioByteStream, + log, + mergeFrames, + stt, +} from '@livekit/agents'; +import type { AudioFrame } from '@livekit/rtc-node'; +import { WebSocket } from 'ws'; +import type { STTAudioFormat, STTCommitStrategy, STTLanguages, STTModels } from './models.js'; + +const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1'; +const AUTHORIZATION_HEADER = 'xi-api-key'; + +export interface STTOptions { + apiKey?: string; + baseURL: string; + model: STTModels; + languageCode?: STTLanguages | string; + tagAudioEvents: boolean; + sampleRate: number; + numChannels: number; + // Streaming-specific options (only used for scribe_v2_realtime) + commitStrategy: STTCommitStrategy; + vadSilenceThresholdSecs?: number; + vadThreshold?: number; + minSpeechDurationMs?: number; + minSilenceDurationMs?: number; +} + +const defaultSTTOptions: STTOptions = { + apiKey: process.env.ELEVEN_API_KEY, + baseURL: API_BASE_URL_V1, + model: 'scribe_v2_realtime', + tagAudioEvents: true, + sampleRate: 16000, + numChannels: 1, + commitStrategy: 'vad', +}; + +export class STT extends stt.STT { + #opts: STTOptions; + #logger = log(); + label = 'elevenlabs.STT'; + + /** + * Create a new instance of ElevenLabs STT. + * + * @remarks + * `apiKey` must be set to your ElevenLabs API key, either using the argument or by setting the + * `ELEVEN_API_KEY` environment variable. 
+ * + * @param opts - Configuration options for the STT service + * @param opts.apiKey - ElevenLabs API key (defaults to ELEVEN_API_KEY env var) + * @param opts.baseURL - Base URL for the API (defaults to https://api.elevenlabs.io/v1) + * @param opts.model - Model to use: 'scribe_v1' (non-streaming), 'scribe_v2' (non-streaming), or 'scribe_v2_realtime' (streaming) + * @param opts.languageCode - Language code for transcription (optional, auto-detected if not set) + * @param opts.tagAudioEvents - Whether to tag audio events like (laughter), (footsteps), etc. (defaults to true, scribe_v1 only) + * @param opts.sampleRate - Sample rate for audio (defaults to 16000) + * @param opts.numChannels - Number of audio channels (defaults to 1) + * @param opts.commitStrategy - Commit strategy: 'vad' (auto) or 'manual' (defaults to 'vad', scribe_v2_realtime only) + * @param opts.vadSilenceThresholdSecs - VAD silence threshold in seconds, 0.3-3.0 (scribe_v2_realtime only) + * @param opts.vadThreshold - VAD threshold, 0.1-0.9 (scribe_v2_realtime only) + * @param opts.minSpeechDurationMs - Minimum speech duration in ms, 50-2000 (scribe_v2_realtime only) + * @param opts.minSilenceDurationMs - Minimum silence duration in ms, 50-2000 (scribe_v2_realtime only) + */ + constructor(opts: Partial = defaultSTTOptions) { + const mergedOpts = { ...defaultSTTOptions, ...opts }; + const isStreaming = mergedOpts.model === 'scribe_v2_realtime'; + + super({ + streaming: isStreaming, + interimResults: isStreaming, + }); + + this.#opts = mergedOpts; + + if (this.#opts.apiKey === undefined) { + throw new Error( + 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY', + ); + } + } + + #createWav(frame: AudioFrame): Buffer { + const bitsPerSample = 16; + const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8; + const blockAlign = (frame.channels * bitsPerSample) / 8; + + const header = Buffer.alloc(44); + header.write('RIFF', 0); + header.writeUInt32LE(36 + frame.data.byteLength, 4); + header.write('WAVE', 8); + header.write('fmt ', 12); + header.writeUInt32LE(16, 16); + header.writeUInt16LE(1, 20); + header.writeUInt16LE(frame.channels, 22); + header.writeUInt32LE(frame.sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(16, 34); + header.write('data', 36); + header.writeUInt32LE(frame.data.byteLength, 40); + return Buffer.concat([header, Buffer.from(frame.data.buffer)]); + } + + async _recognize(buffer: AudioBuffer, language?: string): Promise { + if (this.#opts.model === 'scribe_v2_realtime') { + throw new Error( + 'scribe_v2_realtime requires streaming. 
Use stream() method instead, or use scribe_v1/scribe_v2 for non-streaming recognize()', + ); + } + + const mergedBuffer = mergeFrames(buffer); + const wavBytes = this.#createWav(mergedBuffer); + + // Create form data for the request + const form = new FormData(); + form.append('file', new Blob([wavBytes], { type: 'audio/wav' }), 'audio.wav'); + form.append('model_id', this.#opts.model); + form.append('tag_audio_events', this.#opts.tagAudioEvents.toString()); + + // Add language code if provided (either from options or recognize call) + const languageCode = language || this.#opts.languageCode; + if (languageCode) { + form.append('language_code', languageCode); + } + + try { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 30000); // 30 second timeout + + const response = await fetch(`${this.#opts.baseURL}/speech-to-text`, { + method: 'POST', + headers: { + [AUTHORIZATION_HEADER]: this.#opts.apiKey!, + }, + body: form, + signal: controller.signal, + }); + + clearTimeout(timeout); + + if (!response.ok) { + const errorText = await response.text(); + throw new APIStatusError({ + message: `ElevenLabs API error: ${response.statusText} - ${errorText}`, + options: { + statusCode: response.status, + requestId: null, + body: null, + retryable: response.status >= 500, + }, + }); + } + + const responseJson = await response.json(); + const extractedText = responseJson.text || ''; + const detectedLanguage = responseJson.language_code || languageCode || 'en'; + const words = responseJson.words || []; + + let startTime = 0; + let endTime = 0; + + if (words.length > 0) { + startTime = Math.min(...words.map((w: any) => w.start || 0)); + endTime = Math.max(...words.map((w: any) => w.end || 0)); + } + + return { + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text: extractedText, + language: detectedLanguage, + startTime, + endTime, + confidence: 1.0, // ElevenLabs doesn't provide confidence scores + }, + ], + }; + } catch (error) { + if (error instanceof APIStatusError) { + throw error; + } + if ((error as any).name === 'AbortError') { + throw new APITimeoutError({ + message: 'ElevenLabs API request timed out', + options: { retryable: true }, + }); + } + throw new APIConnectionError({ + message: `Failed to connect to ElevenLabs: ${error}`, + options: { retryable: true }, + }); + } + } + + updateOptions(opts: Partial) { + this.#opts = { ...this.#opts, ...opts }; + } + + stream(): SpeechStream { + if (this.#opts.model !== 'scribe_v2_realtime') { + throw new Error( + 'Streaming is only supported with scribe_v2_realtime model. 
For non-streaming, use recognize() method.', + ); + } + return new SpeechStream(this, this.#opts); + } +} + +export class SpeechStream extends stt.SpeechStream { + #opts: STTOptions; + #logger = log(); + #speaking = false; + #lastCommittedText = ''; + label = 'elevenlabs.SpeechStream'; + + constructor(stt: STT, opts: STTOptions) { + super(stt, opts.sampleRate); + this.#opts = opts; + this.closed = false; + } + + protected async run() { + const maxRetry = 32; + let retries = 0; + let ws: WebSocket; + + while (!this.input.closed) { + // Build WebSocket URL + const audioFormat: STTAudioFormat = `pcm_${this.#opts.sampleRate}` as STTAudioFormat; + const baseUrl = this.#opts.baseURL.replace('https://', 'wss://').replace('http://', 'ws://'); + const streamURL = new URL(`${baseUrl}/speech-to-text/realtime`); + + const params = { + model_id: this.#opts.model, + encoding: audioFormat, + sample_rate: this.#opts.sampleRate, + commit_strategy: this.#opts.commitStrategy, + vad_silence_threshold_secs: this.#opts.vadSilenceThresholdSecs, + vad_threshold: this.#opts.vadThreshold, + min_speech_duration_ms: this.#opts.minSpeechDurationMs, + min_silence_duration_ms: this.#opts.minSilenceDurationMs, + language_code: this.#opts.languageCode, + }; + + Object.entries(params).forEach(([k, v]) => { + if (v !== undefined) { + streamURL.searchParams.append(k, String(v)); + } + }); + + ws = new WebSocket(streamURL.toString(), { + headers: { [AUTHORIZATION_HEADER]: `${this.#opts.apiKey}` }, + }); + + try { + await new Promise((resolve, reject) => { + ws.on('open', resolve); + ws.on('error', (error) => reject(error)); + ws.on('close', (code) => reject(`WebSocket returned ${code}`)); + }); + + await this.#runWS(ws); + } catch (e) { + if (retries >= maxRetry) { + throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`); + } + + const delay = Math.min(retries * 5, 10); + retries++; + + this.#logger.warn( + `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`, + ); + await new Promise((resolve) => setTimeout(resolve, delay * 1000)); + } + } + + this.closed = true; + } + + async #runWS(ws: WebSocket) { + let closing = false; + + const keepalive = setInterval(() => { + try { + if (ws.readyState === WebSocket.OPEN) { + ws.send(JSON.stringify({ message_type: 'keepalive' })); + } + } catch { + clearInterval(keepalive); + return; + } + }, 5000); + + const sendTask = async () => { + const samples100Ms = Math.floor(this.#opts.sampleRate / 10); + const stream = new AudioByteStream( + this.#opts.sampleRate, + this.#opts.numChannels, + samples100Ms, + ); + + for await (const data of this.input) { + let frames: AudioFrame[]; + if (data === SpeechStream.FLUSH_SENTINEL) { + frames = stream.flush(); + + // Send any remaining frames + for (const frame of frames) { + const audioB64 = Buffer.from(frame.data.buffer).toString('base64'); + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: audioB64, + commit: false, + sample_rate: this.#opts.sampleRate, + }), + ); + } + + // Send commit message if using manual commit strategy + if (this.#opts.commitStrategy === 'manual') { + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: '', + commit: true, + sample_rate: this.#opts.sampleRate, + }), + ); + } + } else { + if ( + data.sampleRate !== this.#opts.sampleRate || + data.channels !== this.#opts.numChannels + ) { + throw new Error( + `sample rate or channel count of frame does not match (expected 
${this.#opts.sampleRate}/${this.#opts.numChannels}, got ${data.sampleRate}/${data.channels})`, + ); + } + frames = stream.write(data.data.buffer); + + for (const frame of frames) { + const audioB64 = Buffer.from(frame.data.buffer).toString('base64'); + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: audioB64, + commit: false, + sample_rate: this.#opts.sampleRate, + }), + ); + } + } + } + + closing = true; + }; + + const wsMonitor = new Promise((_, reject) => + ws.once('close', (code, reason) => { + if (!closing) { + this.#logger.error(`WebSocket closed with code ${code}: ${reason}`); + reject(new Error('WebSocket closed')); + } + }), + ); + + const listenTask = async () => { + await new Promise((resolve) => { + ws.on('message', (msg) => { + const json = JSON.parse(msg.toString()); + this.#processStreamEvent(json); + + if (this.closed || closing) { + resolve(); + } + }); + }); + }; + + await Promise.all([sendTask(), listenTask(), wsMonitor]); + closing = true; + ws.close(); + clearInterval(keepalive); + } + + #processStreamEvent(data: any) { + const messageType = data.message_type; + + if (messageType === 'partial_transcript') { + const text = data.text || ''; + + // Ignore stale partial transcripts that match the last committed text + if (text && text === this.#lastCommittedText) { + return; + } + + if (text) { + // Send START_OF_SPEECH if this is the first transcript in a new segment + if (!this.#speaking) { + this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH }); + this.#speaking = true; + this.#lastCommittedText = ''; + } + + this.queue.put({ + type: stt.SpeechEventType.INTERIM_TRANSCRIPT, + alternatives: [ + { + text, + language: this.#opts.languageCode || 'en', + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }); + } + } else if ( + messageType === 'committed_transcript' || + messageType === 'committed_transcript_with_timestamps' + ) { + const text = data.text || ''; + + if (text) { + // Send START_OF_SPEECH if we get a FINAL without any INTERIM first + if (!this.#speaking) { + this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH }); + } + + this.queue.put({ + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text, + language: this.#opts.languageCode || 'en', + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }); + + // Send end of speech event + this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH }); + this.#speaking = false; + this.#lastCommittedText = text; + } else { + // Empty commit - just reset state + this.#speaking = false; + this.#lastCommittedText = ''; + } + } else if (messageType === 'session_started') { + const sessionId = data.session_id || 'unknown'; + this.#logger.info(`ElevenLabs session started with ID: ${sessionId}`); + } else { + this.#logger.warn(`Unknown message type: ${messageType}`); + } + } +} From 43b0dd1f2d09477f32ecacf924e7f0703ff3b820 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 13 Nov 2025 11:57:37 +0100 Subject: [PATCH 2/7] improve logging and error handling also merge README --- plugins/elevenlabs/README.md | 207 +++++++++++++++++++++++++++++- plugins/elevenlabs/README.stt.md | 213 ------------------------------- plugins/elevenlabs/src/stt.ts | 55 ++++---- 3 files changed, 237 insertions(+), 238 deletions(-) delete mode 100644 plugins/elevenlabs/README.stt.md diff --git a/plugins/elevenlabs/README.md b/plugins/elevenlabs/README.md index 77764172b..7fd5e59f0 100644 --- a/plugins/elevenlabs/README.md +++ b/plugins/elevenlabs/README.md @@ -3,15 +3,218 @@ 
SPDX-FileCopyrightText: 2024 LiveKit, Inc. SPDX-License-Identifier: Apache-2.0 --> -# ElevenLabs plugin for LiveKit Agents +# ElevenLabs Plugin for LiveKit Agents The Agents Framework is designed for building realtime, programmable participants that run on servers. Use it to create conversational, multi-modal voice agents that can see, hear, and understand. -This package contains the ElevenLabs plugin, which allows for voice synthesis. +This package contains the ElevenLabs plugin, which provides: +- **Text-to-Speech (TTS)**: High-quality voice synthesis with multiple voices and models +- **Speech-to-Text (STT)**: Real-time and batch transcription with Scribe API + Refer to the [documentation](https://docs.livekit.io/agents/overview/) for information on how to use it, or browse the [API reference](https://docs.livekit.io/agents-js/modules/plugins_agents_plugin_elevenlabs.html). See the [repository](https://github.com/livekit/agents-js) for more information about the framework as a whole. + +## Installation + +```bash +pnpm add @livekit/agents-plugin-elevenlabs +``` + +Set your ElevenLabs API key: +```bash +export ELEVEN_API_KEY=your_api_key_here +``` + +--- + +## Text-to-Speech (TTS) + +For TTS documentation, refer to the [API reference](https://docs.livekit.io/agents-js/modules/plugins_agents_plugin_elevenlabs.html). + +### Quick Example + +```typescript +import { TTS } from '@livekit/agents-plugin-elevenlabs'; + +const tts = new TTS(); +// Use tts for voice synthesis +``` + +--- + +## Speech-to-Text (STT) + +### Features + +- **Multiple Model Support**: Choose between Scribe v1, v2, and v2 realtime +- **Streaming & Non-Streaming**: Support for both batch and real-time transcription +- **Multi-Language**: Supports 35+ languages with automatic language detection +- **Audio Event Tagging**: Optional tagging of non-speech audio events (laughter, footsteps, etc.) 
+- **VAD Configuration**: Customizable voice activity detection for streaming mode + +### Supported Models + +#### Scribe v1 (`scribe_v1`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Batch transcription of pre-recorded audio +- **Features**: Audio event tagging, language detection + +#### Scribe v2 (`scribe_v2`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Improved accuracy for batch transcription +- **Features**: Enhanced model, language detection + +#### Scribe v2 Realtime (`scribe_v2_realtime`) +- **Type**: Streaming (default) +- **Method**: WebSocket +- **Use Case**: Real-time conversation transcription +- **Features**: Interim results, VAD-based segmentation, manual commit support + +### Quick Start + +#### Non-Streaming (Scribe v1) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; + +const stt = new STT({ + apiKey: process.env.ELEVEN_API_KEY, // or set ELEVEN_API_KEY env var + model: 'scribe_v1', + languageCode: 'en', + tagAudioEvents: true, +}); +``` + +#### Streaming (Scribe v2 Realtime) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; +import { SpeechEventType } from '@livekit/agents'; + +const stt = new STT({ + model: 'scribe_v2_realtime', // default + sampleRate: 16000, + languageCode: 'en', + commitStrategy: 'vad', // auto-commit on speech end + vadSilenceThresholdSecs: 1.0, +}); +``` + +### Configuration Options + +#### Common Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `apiKey` | `string` | `process.env.ELEVEN_API_KEY` | ElevenLabs API key | +| `baseURL` | `string` | `https://api.elevenlabs.io/v1` | API base URL | +| `model` | `STTModels` | `'scribe_v2_realtime'` | Model to use | +| `languageCode` | `string` | `undefined` | Language code (auto-detected if not set) | + +#### Non-Streaming Options (v1, v2) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `tagAudioEvents` | `boolean` | `true` | Tag non-speech events like (laughter) | + +#### Streaming Options (v2_realtime) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `sampleRate` | `number` | `16000` | Audio sample rate in Hz (16000, 22050, or 44100) | +| `numChannels` | `number` | `1` | Number of audio channels | +| `commitStrategy` | `'vad' \| 'manual'` | `'vad'` | How to commit transcripts | +| `vadSilenceThresholdSecs` | `number` | `undefined` | VAD silence threshold (0.3-3.0 seconds) | +| `vadThreshold` | `number` | `undefined` | VAD threshold (0.1-0.9) | +| `minSpeechDurationMs` | `number` | `undefined` | Minimum speech duration (50-2000 ms) | +| `minSilenceDurationMs` | `number` | `undefined` | Minimum silence duration (50-2000 ms) | + +### Supported Languages + +The STT plugin supports 35+ languages including: + +English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Dutch (`nl`), Swedish (`sv`), Finnish (`fi`), Danish (`da`), Norwegian (`no`), Czech (`cs`), Romanian (`ro`), Slovak (`sk`), Ukrainian (`uk`), Greek (`el`), Turkish (`tr`), Russian (`ru`), Bulgarian (`bg`), Croatian (`hr`), Serbian (`sr`), Hungarian (`hu`), Lithuanian (`lt`), Latvian (`lv`), Estonian (`et`), Japanese (`ja`), Chinese (`zh`), Korean (`ko`), Hindi (`hi`), Arabic (`ar`), Persian (`fa`), Hebrew (`he`), Indonesian (`id`), Malay (`ms`), Thai (`th`), Vietnamese (`vi`), Tamil (`ta`), Urdu (`ur`) + +### Advanced Usage + +#### Custom VAD Parameters + 
+Fine-tune voice activity detection for your use case: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + commitStrategy: 'vad', + + // Longer silence before committing (good for thoughtful speakers) + vadSilenceThresholdSecs: 2.0, + + // Higher threshold = more strict about what's considered speech + vadThreshold: 0.7, + + // Ignore very short speech bursts (reduce false positives) + minSpeechDurationMs: 200, + + // Require longer silence to end speech (reduce fragmentation) + minSilenceDurationMs: 500, +}); +``` + +#### Multi-Language Support + +Let ElevenLabs auto-detect the language: + +```typescript +const stt = new STT({ + model: 'scribe_v1', + // Don't set languageCode - will auto-detect +}); + +const event = await stt.recognize(audioBuffer); +console.log('Detected language:', event.alternatives[0].language); +console.log('Text:', event.alternatives[0].text); +``` + +Or specify a language: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + languageCode: 'es', // Spanish +}); +``` + +### Model Comparison + +| Feature | Scribe v1 | Scribe v2 | Scribe v2 Realtime | +|---------|-----------|-----------|-------------------| +| **Type** | Non-streaming | Non-streaming | Streaming | +| **Latency** | High (batch) | High (batch) | Low (real-time) | +| **Interim Results** | ❌ | ❌ | ✅ | +| **Audio Event Tagging** | ✅ | ❌ | ❌ | +| **VAD Configuration** | ❌ | ❌ | ✅ | +| **Manual Commit** | ❌ | ❌ | ✅ | +| **Best For** | Batch jobs with event detection | High-accuracy batch | Real-time conversations | + +--- + +## Resources + +- [ElevenLabs TTS Documentation](https://elevenlabs.io/docs/api-reference/text-to-speech) +- [ElevenLabs STT Documentation](https://elevenlabs.io/docs/api-reference/speech-to-text) +- [Scribe v2 Streaming Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming) +- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) +- [LiveKit Agents JS Repository](https://github.com/livekit/agents-js) + +## License + +Copyright 2025 LiveKit, Inc. + +Licensed under the Apache License, Version 2.0. diff --git a/plugins/elevenlabs/README.stt.md b/plugins/elevenlabs/README.stt.md deleted file mode 100644 index 095c138f1..000000000 --- a/plugins/elevenlabs/README.stt.md +++ /dev/null @@ -1,213 +0,0 @@ -# ElevenLabs STT Plugin for LiveKit Agents - -This plugin provides speech-to-text capabilities using ElevenLabs Scribe API for LiveKit agents. - -## Features - -- **Multiple Model Support**: Choose between Scribe v1, v2, and v2 realtime -- **Streaming & Non-Streaming**: Support for both batch and real-time transcription -- **Multi-Language**: Supports 35+ languages with automatic language detection -- **Audio Event Tagging**: Optional tagging of non-speech audio events (laughter, footsteps, etc.) 
-- **VAD Configuration**: Customizable voice activity detection for streaming mode - -## Installation - -```bash -pnpm add @livekit/agents-plugin-elevenlabs -``` - -## Supported Models - -### Scribe v1 (`scribe_v1`) -- **Type**: Non-streaming -- **Method**: HTTP POST -- **Use Case**: Batch transcription of pre-recorded audio -- **Features**: Audio event tagging, language detection - -### Scribe v2 (`scribe_v2`) -- **Type**: Non-streaming -- **Method**: HTTP POST -- **Use Case**: Improved accuracy for batch transcription -- **Features**: Enhanced model, language detection - -### Scribe v2 Realtime (`scribe_v2_realtime`) -- **Type**: Streaming -- **Method**: WebSocket -- **Use Case**: Real-time conversation transcription -- **Features**: Interim results, VAD-based segmentation, manual commit support - -## Quick Start - -### Non-Streaming (Scribe v1) - -```typescript -import { STT } from '@livekit/agents-plugin-elevenlabs'; - -const stt = new STT({ - apiKey: process.env.ELEVEN_API_KEY, // or set ELEVEN_API_KEY env var - model: 'scribe_v1', - languageCode: 'en', - tagAudioEvents: true, -}); -``` - -### Streaming (Scribe v2 Realtime) - -```typescript -import { STT } from '@livekit/agents-plugin-elevenlabs'; -import { SpeechEventType } from '@livekit/agents'; - -const stt = new STT({ - model: 'scribe_v2_realtime', // default - sampleRate: 16000, - languageCode: 'en', - commitStrategy: 'vad', // auto-commit on speech end - vadSilenceThresholdSecs: 1.0, -}); -``` - -## Configuration Options - -### Common Options - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `apiKey` | `string` | `process.env.ELEVEN_API_KEY` | ElevenLabs API key | -| `baseURL` | `string` | `https://api.elevenlabs.io/v1` | API base URL | -| `model` | `STTModels` | `'scribe_v1'` | Model to use | -| `languageCode` | `string` | `undefined` | Language code (auto-detected if not set) | - -### Non-Streaming Options (v1, v2) - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `tagAudioEvents` | `boolean` | `true` | Tag non-speech events like (laughter) | - -### Streaming Options (v2_realtime) - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `sampleRate` | `number` | `16000` | Audio sample rate in Hz (16000, 22050, or 44100) | -| `numChannels` | `number` | `1` | Number of audio channels | -| `commitStrategy` | `'vad' \| 'manual'` | `'vad'` | How to commit transcripts | -| `vadSilenceThresholdSecs` | `number` | `undefined` | VAD silence threshold (0.3-3.0 seconds) | -| `vadThreshold` | `number` | `undefined` | VAD threshold (0.1-0.9) | -| `minSpeechDurationMs` | `number` | `undefined` | Minimum speech duration (50-2000 ms) | -| `minSilenceDurationMs` | `number` | `undefined` | Minimum silence duration (50-2000 ms) | - -## Supported Languages - -The plugin supports 35+ languages including: - -- **English** (`en`) -- **Spanish** (`es`) -- **French** (`fr`) -- **German** (`de`) -- **Italian** (`it`) -- **Portuguese** (`pt`) -- **Polish** (`pl`) -- **Dutch** (`nl`) -- **Swedish** (`sv`) -- **Finnish** (`fi`) -- **Danish** (`da`) -- **Norwegian** (`no`) -- **Czech** (`cs`) -- **Romanian** (`ro`) -- **Slovak** (`sk`) -- **Ukrainian** (`uk`) -- **Greek** (`el`) -- **Turkish** (`tr`) -- **Russian** (`ru`) -- **Bulgarian** (`bg`) -- **Croatian** (`hr`) -- **Serbian** (`sr`) -- **Hungarian** (`hu`) -- **Lithuanian** (`lt`) -- **Latvian** (`lv`) -- **Estonian** (`et`) -- **Japanese** (`ja`) -- **Chinese** (`zh`) -- **Korean** 
(`ko`) -- **Hindi** (`hi`) -- **Arabic** (`ar`) -- **Persian** (`fa`) -- **Hebrew** (`he`) -- **Indonesian** (`id`) -- **Malay** (`ms`) -- **Thai** (`th`) -- **Vietnamese** (`vi`) -- **Tamil** (`ta`) -- **Urdu** (`ur`) - -## Advanced Usage - -### Custom VAD Parameters - -Fine-tune voice activity detection for your use case: - -```typescript -const stt = new STT({ - model: 'scribe_v2_realtime', - commitStrategy: 'vad', - - // Longer silence before committing (good for thoughtful speakers) - vadSilenceThresholdSecs: 2.0, - - // Higher threshold = more strict about what's considered speech - vadThreshold: 0.7, - - // Ignore very short speech bursts (reduce false positives) - minSpeechDurationMs: 200, - - // Require longer silence to end speech (reduce fragmentation) - minSilenceDurationMs: 500, -}); -``` - -### Multi-Language Support - -Let ElevenLabs auto-detect the language: - -```typescript -const stt = new STT({ - model: 'scribe_v1', - // Don't set languageCode - will auto-detect -}); - -const event = await stt.recognize(audioBuffer); -console.log('Detected language:', event.alternatives[0].language); -console.log('Text:', event.alternatives[0].text); -``` - -Or specify a language: - -```typescript -const stt = new STT({ - model: 'scribe_v2_realtime', - languageCode: 'es', // Spanish -}); -``` - -## Model Comparison - -| Feature | Scribe v1 | Scribe v2 | Scribe v2 Realtime | -|---------|-----------|-----------|-------------------| -| **Type** | Non-streaming | Non-streaming | Streaming | -| **Latency** | High (batch) | High (batch) | Low (real-time) | -| **Interim Results** | ❌ | ❌ | ✅ | -| **Audio Event Tagging** | ✅ | ❌ | ❌ | -| **VAD Configuration** | ❌ | ❌ | ✅ | -| **Manual Commit** | ❌ | ❌ | ✅ | -| **Best For** | Batch jobs with event detection | High-accuracy batch | Real-time conversations | - -## Resources - -- [ElevenLabs STT Documentation](https://elevenlabs.io/docs/api-reference/speech-to-text) -- [Scribe v2 Streaming Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming) -- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) - -## License - -Copyright 2025 LiveKit, Inc. - -Licensed under the Apache License, Version 2.0. 
diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts index 6aedae0aa..5f52fc8f7 100644 --- a/plugins/elevenlabs/src/stt.ts +++ b/plugins/elevenlabs/src/stt.ts @@ -269,6 +269,9 @@ export class SpeechStream extends stt.SpeechStream { ws.on('close', (code) => reject(`WebSocket returned ${code}`)); }); + // on success reset retries + retries = 0; + await this.#runWS(ws); } catch (e) { if (retries >= maxRetry) { @@ -279,7 +282,7 @@ export class SpeechStream extends stt.SpeechStream { retries++; this.#logger.warn( - `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`, + `STT: failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`, ); await new Promise((resolve) => setTimeout(resolve, delay * 1000)); } @@ -291,17 +294,6 @@ export class SpeechStream extends stt.SpeechStream { async #runWS(ws: WebSocket) { let closing = false; - const keepalive = setInterval(() => { - try { - if (ws.readyState === WebSocket.OPEN) { - ws.send(JSON.stringify({ message_type: 'keepalive' })); - } - } catch { - clearInterval(keepalive); - return; - } - }, 5000); - const sendTask = async () => { const samples100Ms = Math.floor(this.#opts.sampleRate / 10); const stream = new AudioByteStream( @@ -310,6 +302,7 @@ export class SpeechStream extends stt.SpeechStream { samples100Ms, ); + let frame_count = 0; for await (const data of this.input) { let frames: AudioFrame[]; if (data === SpeechStream.FLUSH_SENTINEL) { @@ -349,6 +342,11 @@ export class SpeechStream extends stt.SpeechStream { ); } frames = stream.write(data.data.buffer); + frame_count += frames.length; + + if (frame_count % 100 == 0) { + this.#logger.debug(`STT: Sent ${frame_count} audio frames`); + } for (const frame of frames) { const audioB64 = Buffer.from(frame.data.buffer).toString('base64'); @@ -364,26 +362,36 @@ export class SpeechStream extends stt.SpeechStream { } } + this.#logger.info(`STT: Send task complete, sent ${frame_count} total frames`); closing = true; }; - const wsMonitor = new Promise((_, reject) => + const wsMonitor = new Promise((resolve, reject) => ws.once('close', (code, reason) => { + console.log('code', code, reason); if (!closing) { - this.#logger.error(`WebSocket closed with code ${code}: ${reason}`); + this.#logger.error(`STT: WebSocket closed unexpectedly with code ${code}: ${reason}`); reject(new Error('WebSocket closed')); + } else { + this.#logger.error(`STT: WebSocket closed normally ${code}: ${reason}`); + resolve(); } }), ); const listenTask = async () => { - await new Promise((resolve) => { + await new Promise((resolve, reject) => { ws.on('message', (msg) => { - const json = JSON.parse(msg.toString()); - this.#processStreamEvent(json); - - if (this.closed || closing) { - resolve(); + try { + const json = JSON.parse(msg.toString()); + this.#processStreamEvent(json); + + if (this.closed || closing) { + resolve(); + } + } catch (err) { + this.#logger.error(`STT: Error processing message: ${msg}`); + reject(err); } }); }); @@ -392,7 +400,6 @@ export class SpeechStream extends stt.SpeechStream { await Promise.all([sendTask(), listenTask(), wsMonitor]); closing = true; ws.close(); - clearInterval(keepalive); } #processStreamEvent(data: any) { @@ -463,9 +470,11 @@ export class SpeechStream extends stt.SpeechStream { } } else if (messageType === 'session_started') { const sessionId = data.session_id || 'unknown'; - this.#logger.info(`ElevenLabs session started with ID: ${sessionId}`); + this.#logger.info(`STT: ElevenLabs session 
started with ID: ${sessionId}`);
+    } else if (messageType === 'input_error') {
+      this.#logger.error(`STT: Input Error received: ${data.error}. We ignore this for now.`);
     } else {
-      this.#logger.warn(`Unknown message type: ${messageType}`);
+      this.#logger.warn(`STT: Unknown message type: ${messageType}`);
     }
   }
 }

From c96ea4e664f633503e9b0e44a1f6030bbf142261 Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 13 Nov 2025 12:12:14 +0100
Subject: [PATCH 3/7] update support matrix in root README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 269868c24..3a07fdc1c 100644
--- a/README.md
+++ b/README.md
@@ -62,11 +62,11 @@ pnpm install @livekit/agents
 Currently, only the following plugins are supported:
 
 | Plugin | Features |
-| ---------------------------------------------------------------------------------------------------- | ------------- |
+| ---------------------------------------------------------------------------------------------------- |---------------|
 | [@livekit/agents-plugin-openai](https://www.npmjs.com/package/@livekit/agents-plugin-openai) | LLM, TTS, STT |
 | [@livekit/agents-plugin-google](https://www.npmjs.com/package/@livekit/agents-plugin-google) | LLM, TTS |
 | [@livekit/agents-plugin-deepgram](https://www.npmjs.com/package/@livekit/agents-plugin-deepgram) | STT, TTS |
-| [@livekit/agents-plugin-elevenlabs](https://www.npmjs.com/package/@livekit/agents-plugin-elevenlabs) | TTS |
+| [@livekit/agents-plugin-elevenlabs](https://www.npmjs.com/package/@livekit/agents-plugin-elevenlabs) | TTS, STT |
 | [@livekit/agents-plugin-cartesia](https://www.npmjs.com/package/@livekit/agents-plugin-cartesia) | TTS |
 | [@livekit/agents-plugin-neuphonic](https://www.npmjs.com/package/@livekit/agents-plugin-neuphonic) | TTS |
 | [@livekit/agents-plugin-resemble](https://www.npmjs.com/package/@livekit/agents-plugin-resemble) | TTS |

From f47dd179ad9d7aa934460f0a7b0051282f5675ca Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 13 Nov 2025 12:12:29 +0100
Subject: [PATCH 4/7] ensure cleaning up event listeners

---
 plugins/elevenlabs/src/stt.ts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts
index 5f52fc8f7..1f59fe184 100644
--- a/plugins/elevenlabs/src/stt.ts
+++ b/plugins/elevenlabs/src/stt.ts
@@ -278,6 +278,8 @@ export class SpeechStream extends stt.SpeechStream {
         throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);
       }
 
+      ws.removeAllListeners();
+
       const delay = Math.min(retries * 5, 10);
       retries++;
 

From 05796317077c07498f8fdeecb3fd8e2041b0acf7 Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 13 Nov 2025 16:32:20 +0100
Subject: [PATCH 5/7] added ws.ping

Unfortunately it doesn't help the disconnect error.
[16:30:14.475] ERROR (390804): STT: WebSocket closed unexpectedly with code 1000:
[16:30:14.475] WARN (390804): STT: failed to connect to ElevenLabs, retrying in 0 seconds: Error: WebSocket closed (1/32)
---
 plugins/elevenlabs/src/stt.ts | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts
index 1f59fe184..cbb4e7a72 100644
--- a/plugins/elevenlabs/src/stt.ts
+++ b/plugins/elevenlabs/src/stt.ts
@@ -296,6 +296,17 @@ export class SpeechStream extends stt.SpeechStream {
   async #runWS(ws: WebSocket) {
     let closing = false;
 
+    const keepalive = setInterval(() => {
+      try {
+        if (ws.readyState === WebSocket.OPEN) {
+          ws.ping();
+        }
+      } catch {
+        clearInterval(keepalive);
+        return;
+      }
+    }, 5000);
+
     const sendTask = async () => {
       const samples100Ms = Math.floor(this.#opts.sampleRate / 10);
       const stream = new AudioByteStream(
@@ -402,6 +413,7 @@ export class SpeechStream extends stt.SpeechStream {
     await Promise.all([sendTask(), listenTask(), wsMonitor]);
     closing = true;
     ws.close();
+    clearInterval(keepalive);
   }
 
   #processStreamEvent(data: any) {

From ac88aeebedc5a77941514292d40ccb2844e492a2 Mon Sep 17 00:00:00 2001
From: simon
Date: Fri, 14 Nov 2025 11:34:35 +0100
Subject: [PATCH 6/7] upgrade ws to 8.18.3

---
 plugins/elevenlabs/package.json | 2 +-
 plugins/elevenlabs/src/stt.ts | 10 ++++-----
 pnpm-lock.yaml | 40 ++-------------------------------
 3 files changed, 8 insertions(+), 44 deletions(-)

diff --git a/plugins/elevenlabs/package.json b/plugins/elevenlabs/package.json
index 12a0e3f15..a7431974f 100644
--- a/plugins/elevenlabs/package.json
+++ b/plugins/elevenlabs/package.json
@@ -44,7 +44,7 @@
     "typescript": "^5.0.0"
   },
   "dependencies": {
-    "ws": "^8.16.0"
+    "ws": "^8.18.3"
   },
   "peerDependencies": {
     "@livekit/agents": "workspace:*",
diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts
index cbb4e7a72..d9a87fc27 100644
--- a/plugins/elevenlabs/src/stt.ts
+++ b/plugins/elevenlabs/src/stt.ts
@@ -38,7 +38,7 @@ const defaultSTTOptions: STTOptions = {
   apiKey: process.env.ELEVEN_API_KEY,
   baseURL: API_BASE_URL_V1,
   model: 'scribe_v2_realtime',
-  tagAudioEvents: true,
+  tagAudioEvents: false,
   sampleRate: 16000,
   numChannels: 1,
   commitStrategy: 'vad',
@@ -61,7 +61,7 @@ export class STT extends stt.STT {
    * @param opts.baseURL - Base URL for the API (defaults to https://api.elevenlabs.io/v1)
    * @param opts.model - Model to use: 'scribe_v1' (non-streaming), 'scribe_v2' (non-streaming), or 'scribe_v2_realtime' (streaming)
    * @param opts.languageCode - Language code for transcription (optional, auto-detected if not set)
-   * @param opts.tagAudioEvents - Whether to tag audio events like (laughter), (footsteps), etc. (defaults to true, scribe_v1 only)
+   * @param opts.tagAudioEvents - Whether to tag audio events like (laughter), (footsteps), etc.
(defaults to false) * @param opts.sampleRate - Sample rate for audio (defaults to 16000) * @param opts.numChannels - Number of audio channels (defaults to 1) * @param opts.commitStrategy - Commit strategy: 'vad' (auto) or 'manual' (defaults to 'vad', scribe_v2_realtime only) @@ -264,9 +264,9 @@ export class SpeechStream extends stt.SpeechStream { try { await new Promise((resolve, reject) => { - ws.on('open', resolve); - ws.on('error', (error) => reject(error)); - ws.on('close', (code) => reject(`WebSocket returned ${code}`)); + ws.once('open', resolve); + ws.once('error', (error) => reject(error)); + ws.once('close', (code) => reject(`WebSocket returned ${code}`)); }); // on success reset retries diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0ff5271e8..bacc5ef17 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -365,8 +365,8 @@ importers: plugins/elevenlabs: dependencies: ws: - specifier: ^8.16.0 - version: 8.17.0 + specifier: ^8.18.3 + version: 8.18.3 devDependencies: '@livekit/agents': specifier: workspace:* @@ -1368,92 +1368,78 @@ packages: resolution: {integrity: sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-arm@1.2.0': resolution: {integrity: sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-ppc64@1.2.0': resolution: {integrity: sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-s390x@1.2.0': resolution: {integrity: sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-x64@1.2.0': resolution: {integrity: sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.2.0': resolution: {integrity: sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.2.0': resolution: {integrity: sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-linux-arm64@0.34.3': resolution: {integrity: sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-linux-arm@0.34.3': resolution: {integrity: sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-linux-ppc64@0.34.3': resolution: {integrity: sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-linux-s390x@0.34.3': resolution: {integrity: sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-linux-x64@0.34.3': resolution: {integrity: 
sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-linuxmusl-arm64@0.34.3': resolution: {integrity: sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-linuxmusl-x64@0.34.3': resolution: {integrity: sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-wasm32@0.34.3': resolution: {integrity: sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==} @@ -1578,14 +1564,12 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [glibc] '@livekit/rtc-node-linux-x64-gnu@0.13.13': resolution: {integrity: sha512-B/SgbeBRobpA5LqmDEoBJHpRXePpoF4RO4F0zJf9BdkDhOR0j77p6hD0ZiOuPTRoBzUqukpsTszp+lZnHoNmiA==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [glibc] '@livekit/rtc-node-win32-x64-msvc@0.13.13': resolution: {integrity: sha512-ygVYV4eHczs3QdaW/p0ADhhm7InUDhFaCYk8OzzIn056ZibZPXzvPizCThZqs8VsDniA01MraZF3qhZZb8IyRg==} @@ -1726,121 +1710,101 @@ packages: resolution: {integrity: sha512-3reX2fUHqN7sffBNqmEyMQVj/CKhIHZd4y631duy0hZqI8Qoqf6lTtmAKvJFYa6bhU95B1D0WgzHkmTg33In0A==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-gnueabihf@4.40.0': resolution: {integrity: sha512-y/qUMOpJxBMy8xCXD++jeu8t7kzjlOCkoxxajL58G62PJGBZVl/Gwpm7JK9+YvlB701rcQTzjUZ1JgUoPTnoQA==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.17.2': resolution: {integrity: sha512-uSqpsp91mheRgw96xtyAGP9FW5ChctTFEoXP0r5FAzj/3ZRv3Uxjtc7taRQSaQM/q85KEKjKsZuiZM3GyUivRg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm-musleabihf@4.40.0': resolution: {integrity: sha512-GoCsPibtVdJFPv/BOIvBKO/XmwZLwaNWdyD8TKlXuqp0veo2sHE+A/vpMQ5iSArRUz/uaoj4h5S6Pn0+PdhRjg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.17.2': resolution: {integrity: sha512-EMMPHkiCRtE8Wdk3Qhtciq6BndLtstqZIroHiiGzB3C5LDJmIZcSzVtLRbwuXuUft1Cnv+9fxuDtDxz3k3EW2A==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-gnu@4.40.0': resolution: {integrity: sha512-L5ZLphTjjAD9leJzSLI7rr8fNqJMlGDKlazW2tX4IUF9P7R5TMQPElpH82Q7eNIDQnQlAyiNVfRPfP2vM5Avvg==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.17.2': resolution: {integrity: sha512-NMPylUUZ1i0z/xJUIx6VUhISZDRT+uTWpBcjdv0/zkp7b/bQDF+NfnfdzuTiB1G6HTodgoFa93hp0O1xl+/UbA==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-musl@4.40.0': resolution: {integrity: sha512-ATZvCRGCDtv1Y4gpDIXsS+wfFeFuLwVxyUBSLawjgXK2tRE6fnsQEkE4csQQYWlBlsFztRzCnBvWVfcae/1qxQ==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-loongarch64-gnu@4.40.0': resolution: {integrity: sha512-wG9e2XtIhd++QugU5MD9i7OnpaVb08ji3P1y/hNbxrQ3sYEelKJOq1UJ5dXczeo6Hj2rfDEL5GdtkMSVLa/AOg==} cpu: [loong64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.17.2': resolution: {integrity: sha512-T19My13y8uYXPw/L/k0JYaX1fJKFT/PWdXiHr8mTbXWxjVF1t+8Xl31DgBBvEKclw+1b00Chg0hxE2O7bTG7GQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.40.0': resolution: {integrity: sha512-vgXfWmj0f3jAUvC7TZSU/m/cOE558ILWDzS7jBhiCAFpY2WEBn5jqgbqvmzlMjtp8KlLcBlXVD2mkTSEQE6Ixw==} cpu: [ppc64] os: [linux] - 
libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.17.2': resolution: {integrity: sha512-BOaNfthf3X3fOWAB+IJ9kxTgPmMqPPH5f5k2DcCsRrBIbWnaJCgX2ll77dV1TdSy9SaXTR5iDXRL8n7AnoP5cg==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.40.0': resolution: {integrity: sha512-uJkYTugqtPZBS3Z136arevt/FsKTF/J9dEMTX/cwR7lsAW4bShzI2R0pJVw+hcBTWF4dxVckYh72Hk3/hWNKvA==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.40.0': resolution: {integrity: sha512-rKmSj6EXQRnhSkE22+WvrqOqRtk733x3p5sWpZilhmjnkHkpeCgWsFFo0dGnUGeA+OZjRl3+VYq+HyCOEuwcxQ==} cpu: [riscv64] os: [linux] - libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.17.2': resolution: {integrity: sha512-W0UP/x7bnn3xN2eYMql2T/+wpASLE5SjObXILTMPUBDB/Fg/FxC+gX4nvCfPBCbNhz51C+HcqQp2qQ4u25ok6g==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-s390x-gnu@4.40.0': resolution: {integrity: sha512-SpnYlAfKPOoVsQqmTFJ0usx0z84bzGOS9anAC0AZ3rdSo3snecihbhFTlJZ8XMwzqAcodjFU4+/SM311dqE5Sw==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.17.2': resolution: {integrity: sha512-Hy7pLwByUOuyaFC6mAr7m+oMC+V7qyifzs/nW2OJfC8H4hbCzOX07Ov0VFk/zP3kBsELWNFi7rJtgbKYsav9QQ==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.40.0': resolution: {integrity: sha512-RcDGMtqF9EFN8i2RYN2W+64CdHruJ5rPqrlYw+cgM3uOVPSsnAQps7cpjXe9be/yDp8UC7VLoCoKC8J3Kn2FkQ==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-musl@4.17.2': resolution: {integrity: sha512-h1+yTWeYbRdAyJ/jMiVw0l6fOOm/0D1vNLui9iPuqgRGnXA0u21gAqOyB5iHjlM9MMfNOm9RHCQ7zLIzT0x11Q==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-linux-x64-musl@4.40.0': resolution: {integrity: sha512-HZvjpiUmSNx5zFgwtQAV1GaGazT2RWvqeDi0hV+AtC8unqqDSsaFjPxfsO6qPtKRRg25SisACWnJ37Yio8ttaw==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-win32-arm64-msvc@4.17.2': resolution: {integrity: sha512-tmdtXMfKAjy5+IQsVtDiCfqbynAQE/TQRpWdVataHmhMb9DCoJxp9vLcCBjEQWMiUYxO1QprH/HbY9ragCEFLA==} From 5620c708f995e3907ca5d1786498fd0829aaf0d9 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 17 Nov 2025 15:55:21 +0100 Subject: [PATCH 7/7] stt improvements --- plugins/elevenlabs/src/stt.ts | 81 ++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts index d9a87fc27..1282b915c 100644 --- a/plugins/elevenlabs/src/stt.ts +++ b/plugins/elevenlabs/src/stt.ts @@ -7,12 +7,15 @@ import { APITimeoutError, type AudioBuffer, AudioByteStream, + Future, + Task, log, mergeFrames, stt, + waitForAbort, } from '@livekit/agents'; import type { AudioFrame } from '@livekit/rtc-node'; -import { WebSocket } from 'ws'; +import { type RawData, WebSocket } from 'ws'; import type { STTAudioFormat, STTCommitStrategy, STTLanguages, STTModels } from './models.js'; const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1'; @@ -263,11 +266,12 @@ export class SpeechStream extends stt.SpeechStream { }); try { - await new Promise((resolve, reject) => { - ws.once('open', resolve); - ws.once('error', (error) => reject(error)); - ws.once('close', (code) => reject(`WebSocket returned ${code}`)); - }); + const connFut = new Future(); + ws.once('open', () => connFut.resolve()); + ws.once('error', (error) => connFut.reject(error)); + ws.once('close', (code) => connFut.reject(new Error(`WebSocket returned ${code}`))); + + await connFut.await; // on success reset retries retries = 0; @@ -295,6 +299,7 @@ export class SpeechStream extends stt.SpeechStream { async 
#runWS(ws: WebSocket) {
     let closing = false;
+    const abortController = new AbortController();
 
     const keepalive = setInterval(() => {
       try {
@@ -307,7 +312,7 @@
       }
     }, 5000);
 
-    const sendTask = async () => {
+    const sendTask = Task.from(async (controller) => {
       const samples100Ms = Math.floor(this.#opts.sampleRate / 10);
       const stream = new AudioByteStream(
         this.#opts.sampleRate,
@@ -316,7 +321,16 @@
       );
 
       let frame_count = 0;
-      for await (const data of this.input) {
+      while (!controller.signal.aborted) {
+        const result = await Promise.race([this.input.next(), waitForAbort(controller.signal)]);
+
+        if (result === undefined) break; // aborted
+        if (result.done) {
+          controller.abort();
+          break;
+        }
+
+        const data = result.value;
         let frames: AudioFrame[];
         if (data === SpeechStream.FLUSH_SENTINEL) {
           frames = stream.flush();
@@ -377,23 +391,26 @@
 
       this.#logger.info(`STT: Send task complete, sent ${frame_count} total frames`);
       closing = true;
-    };
-
-    const wsMonitor = new Promise((resolve, reject) =>
-      ws.once('close', (code, reason) => {
-        if (!closing) {
-          this.#logger.error(`STT: WebSocket closed unexpectedly with code ${code}: ${reason}`);
-          reject(new Error('WebSocket closed'));
-        } else {
-          this.#logger.error(`STT: WebSocket closed normally ${code}: ${reason}`);
-          resolve();
-        }
-      }),
-    );
+    }, abortController);
+
+    const wsMonitor = Task.from(async (controller) => {
+      const connectionClosed = new Promise((resolve, reject) =>
+        ws.once('close', (code, reason) => {
+          if (!closing) {
+            this.#logger.error(`STT: WebSocket closed unexpectedly with code ${code}: ${reason}`);
+            reject(new Error('WebSocket closed'));
+          } else {
+            this.#logger.info(`STT: WebSocket closed normally ${code}: ${reason}`);
+            resolve();
+          }
+        }),
+      );
 
-    const listenTask = async () => {
-      await new Promise((resolve, reject) => {
+      await Promise.race([connectionClosed, waitForAbort(controller.signal)]);
+    }, abortController);
+
+    const listenTask = Task.from(async (controller) => {
+      const listenMessage = new Promise((resolve, reject) => {
         ws.on('message', (msg) => {
           try {
             const json = JSON.parse(msg.toString());
@@ -408,12 +425,18 @@
           }
         });
       });
-    };
-
-    await Promise.all([sendTask(), listenTask(), wsMonitor]);
-    closing = true;
-    ws.close();
-    clearInterval(keepalive);
+
+      await Promise.race([listenMessage, waitForAbort(controller.signal)]);
+    }, abortController);
+
+    try {
+      await Promise.all([sendTask.result, listenTask.result, wsMonitor.result]);
+    } finally {
+      closing = true;
+      abortController.abort();
+      ws.close();
+      clearInterval(keepalive);
+    }
   }
 
   #processStreamEvent(data: any) {