From 7a98841a9f2fe838bfd5f9efcf678c83d126a6d3 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 13 Nov 2025 11:06:52 +0100 Subject: [PATCH 1/7] elevenlabs stt support including scribe v2 realtime --- plugins/elevenlabs/README.stt.md | 213 ++++++++++++++ plugins/elevenlabs/src/index.ts | 2 + plugins/elevenlabs/src/models.ts | 47 +++ plugins/elevenlabs/src/stt.ts | 471 +++++++++++++++++++++++++++++++ 4 files changed, 733 insertions(+) create mode 100644 plugins/elevenlabs/README.stt.md create mode 100644 plugins/elevenlabs/src/stt.ts diff --git a/plugins/elevenlabs/README.stt.md b/plugins/elevenlabs/README.stt.md new file mode 100644 index 000000000..095c138f1 --- /dev/null +++ b/plugins/elevenlabs/README.stt.md @@ -0,0 +1,213 @@ +# ElevenLabs STT Plugin for LiveKit Agents + +This plugin provides speech-to-text capabilities using ElevenLabs Scribe API for LiveKit agents. + +## Features + +- **Multiple Model Support**: Choose between Scribe v1, v2, and v2 realtime +- **Streaming & Non-Streaming**: Support for both batch and real-time transcription +- **Multi-Language**: Supports 35+ languages with automatic language detection +- **Audio Event Tagging**: Optional tagging of non-speech audio events (laughter, footsteps, etc.) +- **VAD Configuration**: Customizable voice activity detection for streaming mode + +## Installation + +```bash +pnpm add @livekit/agents-plugin-elevenlabs +``` + +## Supported Models + +### Scribe v1 (`scribe_v1`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Batch transcription of pre-recorded audio +- **Features**: Audio event tagging, language detection + +### Scribe v2 (`scribe_v2`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Improved accuracy for batch transcription +- **Features**: Enhanced model, language detection + +### Scribe v2 Realtime (`scribe_v2_realtime`) +- **Type**: Streaming +- **Method**: WebSocket +- **Use Case**: Real-time conversation transcription +- **Features**: Interim results, VAD-based segmentation, manual commit support + +## Quick Start + +### Non-Streaming (Scribe v1) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; + +const stt = new STT({ + apiKey: process.env.ELEVEN_API_KEY, // or set ELEVEN_API_KEY env var + model: 'scribe_v1', + languageCode: 'en', + tagAudioEvents: true, +}); +``` + +### Streaming (Scribe v2 Realtime) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; +import { SpeechEventType } from '@livekit/agents'; + +const stt = new STT({ + model: 'scribe_v2_realtime', // default + sampleRate: 16000, + languageCode: 'en', + commitStrategy: 'vad', // auto-commit on speech end + vadSilenceThresholdSecs: 1.0, +}); +``` + +## Configuration Options + +### Common Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `apiKey` | `string` | `process.env.ELEVEN_API_KEY` | ElevenLabs API key | +| `baseURL` | `string` | `https://api.elevenlabs.io/v1` | API base URL | +| `model` | `STTModels` | `'scribe_v1'` | Model to use | +| `languageCode` | `string` | `undefined` | Language code (auto-detected if not set) | + +### Non-Streaming Options (v1, v2) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `tagAudioEvents` | `boolean` | `true` | Tag non-speech events like (laughter) | + +### Streaming Options (v2_realtime) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `sampleRate` | `number` | `16000` | Audio sample rate in Hz 
(16000, 22050, or 44100) | +| `numChannels` | `number` | `1` | Number of audio channels | +| `commitStrategy` | `'vad' \| 'manual'` | `'vad'` | How to commit transcripts | +| `vadSilenceThresholdSecs` | `number` | `undefined` | VAD silence threshold (0.3-3.0 seconds) | +| `vadThreshold` | `number` | `undefined` | VAD threshold (0.1-0.9) | +| `minSpeechDurationMs` | `number` | `undefined` | Minimum speech duration (50-2000 ms) | +| `minSilenceDurationMs` | `number` | `undefined` | Minimum silence duration (50-2000 ms) | + +## Supported Languages + +The plugin supports 35+ languages including: + +- **English** (`en`) +- **Spanish** (`es`) +- **French** (`fr`) +- **German** (`de`) +- **Italian** (`it`) +- **Portuguese** (`pt`) +- **Polish** (`pl`) +- **Dutch** (`nl`) +- **Swedish** (`sv`) +- **Finnish** (`fi`) +- **Danish** (`da`) +- **Norwegian** (`no`) +- **Czech** (`cs`) +- **Romanian** (`ro`) +- **Slovak** (`sk`) +- **Ukrainian** (`uk`) +- **Greek** (`el`) +- **Turkish** (`tr`) +- **Russian** (`ru`) +- **Bulgarian** (`bg`) +- **Croatian** (`hr`) +- **Serbian** (`sr`) +- **Hungarian** (`hu`) +- **Lithuanian** (`lt`) +- **Latvian** (`lv`) +- **Estonian** (`et`) +- **Japanese** (`ja`) +- **Chinese** (`zh`) +- **Korean** (`ko`) +- **Hindi** (`hi`) +- **Arabic** (`ar`) +- **Persian** (`fa`) +- **Hebrew** (`he`) +- **Indonesian** (`id`) +- **Malay** (`ms`) +- **Thai** (`th`) +- **Vietnamese** (`vi`) +- **Tamil** (`ta`) +- **Urdu** (`ur`) + +## Advanced Usage + +### Custom VAD Parameters + +Fine-tune voice activity detection for your use case: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + commitStrategy: 'vad', + + // Longer silence before committing (good for thoughtful speakers) + vadSilenceThresholdSecs: 2.0, + + // Higher threshold = more strict about what's considered speech + vadThreshold: 0.7, + + // Ignore very short speech bursts (reduce false positives) + minSpeechDurationMs: 200, + + // Require longer silence to end speech (reduce fragmentation) + minSilenceDurationMs: 500, +}); +``` + +### Multi-Language Support + +Let ElevenLabs auto-detect the language: + +```typescript +const stt = new STT({ + model: 'scribe_v1', + // Don't set languageCode - will auto-detect +}); + +const event = await stt.recognize(audioBuffer); +console.log('Detected language:', event.alternatives[0].language); +console.log('Text:', event.alternatives[0].text); +``` + +Or specify a language: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + languageCode: 'es', // Spanish +}); +``` + +## Model Comparison + +| Feature | Scribe v1 | Scribe v2 | Scribe v2 Realtime | +|---------|-----------|-----------|-------------------| +| **Type** | Non-streaming | Non-streaming | Streaming | +| **Latency** | High (batch) | High (batch) | Low (real-time) | +| **Interim Results** | ❌ | ❌ | ✅ | +| **Audio Event Tagging** | ✅ | ❌ | ❌ | +| **VAD Configuration** | ❌ | ❌ | ✅ | +| **Manual Commit** | ❌ | ❌ | ✅ | +| **Best For** | Batch jobs with event detection | High-accuracy batch | Real-time conversations | + +## Resources + +- [ElevenLabs STT Documentation](https://elevenlabs.io/docs/api-reference/speech-to-text) +- [Scribe v2 Streaming Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming) +- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) + +## License + +Copyright 2025 LiveKit, Inc. + +Licensed under the Apache License, Version 2.0. 
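+
+## Appendix: Consuming Stream Events
+
+The streaming quick start above imports `SpeechEventType` but never shows it in use. Below is a minimal sketch of reading events from the realtime stream; it assumes the `SpeechStream` returned by `stream()` supports `pushFrame()` and async iteration, as in other LiveKit Agents STT plugins, so adapt it to however your agent sources audio frames.
+
+```typescript
+import { SpeechEventType } from '@livekit/agents';
+import { STT } from '@livekit/agents-plugin-elevenlabs';
+
+const stt = new STT({ model: 'scribe_v2_realtime' });
+const stream = stt.stream();
+
+// Feed audio into the stream as frames arrive from your source,
+// e.g. stream.pushFrame(frame) inside a LiveKit track handler.
+
+for await (const event of stream) {
+  if (event.type === SpeechEventType.INTERIM_TRANSCRIPT) {
+    console.log('interim:', event.alternatives?.[0]?.text);
+  } else if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {
+    console.log('final:', event.alternatives?.[0]?.text);
+  }
+}
+```
+
+With `commitStrategy: 'manual'`, a final transcript is only committed when the stream is flushed (the plugin sends a commit message when it sees the flush sentinel), so call `stream.flush()` at the boundaries you want committed; `flush()` is assumed here to be part of the base `SpeechStream` API.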
diff --git a/plugins/elevenlabs/src/index.ts b/plugins/elevenlabs/src/index.ts index 66c4eeff6..29a7ec457 100644 --- a/plugins/elevenlabs/src/index.ts +++ b/plugins/elevenlabs/src/index.ts @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 import { Plugin } from '@livekit/agents'; +export * from './models.js'; +export * from './stt.js'; export * from './tts.js'; class ElevenLabsPlugin extends Plugin { diff --git a/plugins/elevenlabs/src/models.ts b/plugins/elevenlabs/src/models.ts index 59e419145..573844d2f 100644 --- a/plugins/elevenlabs/src/models.ts +++ b/plugins/elevenlabs/src/models.ts @@ -21,3 +21,50 @@ export type TTSEncoding = // | 'mp3_44100_128' // | 'mp3_44100_192' 'pcm_16000' | 'pcm_22050' | 'pcm_44100'; + +export type STTModels = 'scribe_v1' | 'scribe_v2' | 'scribe_v2_realtime'; + +export type STTAudioFormat = 'pcm_16000' | 'pcm_22050' | 'pcm_44100'; + +export type STTCommitStrategy = 'vad' | 'manual'; + +export type STTLanguages = + | 'en' + | 'es' + | 'fr' + | 'de' + | 'it' + | 'pt' + | 'pl' + | 'nl' + | 'sv' + | 'fi' + | 'da' + | 'no' + | 'cs' + | 'ro' + | 'sk' + | 'uk' + | 'el' + | 'tr' + | 'ru' + | 'bg' + | 'hr' + | 'sr' + | 'hu' + | 'lt' + | 'lv' + | 'et' + | 'ja' + | 'zh' + | 'ko' + | 'hi' + | 'ar' + | 'fa' + | 'he' + | 'id' + | 'ms' + | 'th' + | 'vi' + | 'ta' + | 'ur'; diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts new file mode 100644 index 000000000..6aedae0aa --- /dev/null +++ b/plugins/elevenlabs/src/stt.ts @@ -0,0 +1,471 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + APIConnectionError, + APIStatusError, + APITimeoutError, + type AudioBuffer, + AudioByteStream, + log, + mergeFrames, + stt, +} from '@livekit/agents'; +import type { AudioFrame } from '@livekit/rtc-node'; +import { WebSocket } from 'ws'; +import type { STTAudioFormat, STTCommitStrategy, STTLanguages, STTModels } from './models.js'; + +const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1'; +const AUTHORIZATION_HEADER = 'xi-api-key'; + +export interface STTOptions { + apiKey?: string; + baseURL: string; + model: STTModels; + languageCode?: STTLanguages | string; + tagAudioEvents: boolean; + sampleRate: number; + numChannels: number; + // Streaming-specific options (only used for scribe_v2_realtime) + commitStrategy: STTCommitStrategy; + vadSilenceThresholdSecs?: number; + vadThreshold?: number; + minSpeechDurationMs?: number; + minSilenceDurationMs?: number; +} + +const defaultSTTOptions: STTOptions = { + apiKey: process.env.ELEVEN_API_KEY, + baseURL: API_BASE_URL_V1, + model: 'scribe_v2_realtime', + tagAudioEvents: true, + sampleRate: 16000, + numChannels: 1, + commitStrategy: 'vad', +}; + +export class STT extends stt.STT { + #opts: STTOptions; + #logger = log(); + label = 'elevenlabs.STT'; + + /** + * Create a new instance of ElevenLabs STT. + * + * @remarks + * `apiKey` must be set to your ElevenLabs API key, either using the argument or by setting the + * `ELEVEN_API_KEY` environment variable. 
+ * + * @param opts - Configuration options for the STT service + * @param opts.apiKey - ElevenLabs API key (defaults to ELEVEN_API_KEY env var) + * @param opts.baseURL - Base URL for the API (defaults to https://api.elevenlabs.io/v1) + * @param opts.model - Model to use: 'scribe_v1' (non-streaming), 'scribe_v2' (non-streaming), or 'scribe_v2_realtime' (streaming) + * @param opts.languageCode - Language code for transcription (optional, auto-detected if not set) + * @param opts.tagAudioEvents - Whether to tag audio events like (laughter), (footsteps), etc. (defaults to true, scribe_v1 only) + * @param opts.sampleRate - Sample rate for audio (defaults to 16000) + * @param opts.numChannels - Number of audio channels (defaults to 1) + * @param opts.commitStrategy - Commit strategy: 'vad' (auto) or 'manual' (defaults to 'vad', scribe_v2_realtime only) + * @param opts.vadSilenceThresholdSecs - VAD silence threshold in seconds, 0.3-3.0 (scribe_v2_realtime only) + * @param opts.vadThreshold - VAD threshold, 0.1-0.9 (scribe_v2_realtime only) + * @param opts.minSpeechDurationMs - Minimum speech duration in ms, 50-2000 (scribe_v2_realtime only) + * @param opts.minSilenceDurationMs - Minimum silence duration in ms, 50-2000 (scribe_v2_realtime only) + */ + constructor(opts: Partial = defaultSTTOptions) { + const mergedOpts = { ...defaultSTTOptions, ...opts }; + const isStreaming = mergedOpts.model === 'scribe_v2_realtime'; + + super({ + streaming: isStreaming, + interimResults: isStreaming, + }); + + this.#opts = mergedOpts; + + if (this.#opts.apiKey === undefined) { + throw new Error( + 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY', + ); + } + } + + #createWav(frame: AudioFrame): Buffer { + const bitsPerSample = 16; + const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8; + const blockAlign = (frame.channels * bitsPerSample) / 8; + + const header = Buffer.alloc(44); + header.write('RIFF', 0); + header.writeUInt32LE(36 + frame.data.byteLength, 4); + header.write('WAVE', 8); + header.write('fmt ', 12); + header.writeUInt32LE(16, 16); + header.writeUInt16LE(1, 20); + header.writeUInt16LE(frame.channels, 22); + header.writeUInt32LE(frame.sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(16, 34); + header.write('data', 36); + header.writeUInt32LE(frame.data.byteLength, 40); + return Buffer.concat([header, Buffer.from(frame.data.buffer)]); + } + + async _recognize(buffer: AudioBuffer, language?: string): Promise { + if (this.#opts.model === 'scribe_v2_realtime') { + throw new Error( + 'scribe_v2_realtime requires streaming. 
Use stream() method instead, or use scribe_v1/scribe_v2 for non-streaming recognize()', + ); + } + + const mergedBuffer = mergeFrames(buffer); + const wavBytes = this.#createWav(mergedBuffer); + + // Create form data for the request + const form = new FormData(); + form.append('file', new Blob([wavBytes], { type: 'audio/wav' }), 'audio.wav'); + form.append('model_id', this.#opts.model); + form.append('tag_audio_events', this.#opts.tagAudioEvents.toString()); + + // Add language code if provided (either from options or recognize call) + const languageCode = language || this.#opts.languageCode; + if (languageCode) { + form.append('language_code', languageCode); + } + + try { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 30000); // 30 second timeout + + const response = await fetch(`${this.#opts.baseURL}/speech-to-text`, { + method: 'POST', + headers: { + [AUTHORIZATION_HEADER]: this.#opts.apiKey!, + }, + body: form, + signal: controller.signal, + }); + + clearTimeout(timeout); + + if (!response.ok) { + const errorText = await response.text(); + throw new APIStatusError({ + message: `ElevenLabs API error: ${response.statusText} - ${errorText}`, + options: { + statusCode: response.status, + requestId: null, + body: null, + retryable: response.status >= 500, + }, + }); + } + + const responseJson = await response.json(); + const extractedText = responseJson.text || ''; + const detectedLanguage = responseJson.language_code || languageCode || 'en'; + const words = responseJson.words || []; + + let startTime = 0; + let endTime = 0; + + if (words.length > 0) { + startTime = Math.min(...words.map((w: any) => w.start || 0)); + endTime = Math.max(...words.map((w: any) => w.end || 0)); + } + + return { + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text: extractedText, + language: detectedLanguage, + startTime, + endTime, + confidence: 1.0, // ElevenLabs doesn't provide confidence scores + }, + ], + }; + } catch (error) { + if (error instanceof APIStatusError) { + throw error; + } + if ((error as any).name === 'AbortError') { + throw new APITimeoutError({ + message: 'ElevenLabs API request timed out', + options: { retryable: true }, + }); + } + throw new APIConnectionError({ + message: `Failed to connect to ElevenLabs: ${error}`, + options: { retryable: true }, + }); + } + } + + updateOptions(opts: Partial) { + this.#opts = { ...this.#opts, ...opts }; + } + + stream(): SpeechStream { + if (this.#opts.model !== 'scribe_v2_realtime') { + throw new Error( + 'Streaming is only supported with scribe_v2_realtime model. 
For non-streaming, use recognize() method.', + ); + } + return new SpeechStream(this, this.#opts); + } +} + +export class SpeechStream extends stt.SpeechStream { + #opts: STTOptions; + #logger = log(); + #speaking = false; + #lastCommittedText = ''; + label = 'elevenlabs.SpeechStream'; + + constructor(stt: STT, opts: STTOptions) { + super(stt, opts.sampleRate); + this.#opts = opts; + this.closed = false; + } + + protected async run() { + const maxRetry = 32; + let retries = 0; + let ws: WebSocket; + + while (!this.input.closed) { + // Build WebSocket URL + const audioFormat: STTAudioFormat = `pcm_${this.#opts.sampleRate}` as STTAudioFormat; + const baseUrl = this.#opts.baseURL.replace('https://', 'wss://').replace('http://', 'ws://'); + const streamURL = new URL(`${baseUrl}/speech-to-text/realtime`); + + const params = { + model_id: this.#opts.model, + encoding: audioFormat, + sample_rate: this.#opts.sampleRate, + commit_strategy: this.#opts.commitStrategy, + vad_silence_threshold_secs: this.#opts.vadSilenceThresholdSecs, + vad_threshold: this.#opts.vadThreshold, + min_speech_duration_ms: this.#opts.minSpeechDurationMs, + min_silence_duration_ms: this.#opts.minSilenceDurationMs, + language_code: this.#opts.languageCode, + }; + + Object.entries(params).forEach(([k, v]) => { + if (v !== undefined) { + streamURL.searchParams.append(k, String(v)); + } + }); + + ws = new WebSocket(streamURL.toString(), { + headers: { [AUTHORIZATION_HEADER]: `${this.#opts.apiKey}` }, + }); + + try { + await new Promise((resolve, reject) => { + ws.on('open', resolve); + ws.on('error', (error) => reject(error)); + ws.on('close', (code) => reject(`WebSocket returned ${code}`)); + }); + + await this.#runWS(ws); + } catch (e) { + if (retries >= maxRetry) { + throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`); + } + + const delay = Math.min(retries * 5, 10); + retries++; + + this.#logger.warn( + `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`, + ); + await new Promise((resolve) => setTimeout(resolve, delay * 1000)); + } + } + + this.closed = true; + } + + async #runWS(ws: WebSocket) { + let closing = false; + + const keepalive = setInterval(() => { + try { + if (ws.readyState === WebSocket.OPEN) { + ws.send(JSON.stringify({ message_type: 'keepalive' })); + } + } catch { + clearInterval(keepalive); + return; + } + }, 5000); + + const sendTask = async () => { + const samples100Ms = Math.floor(this.#opts.sampleRate / 10); + const stream = new AudioByteStream( + this.#opts.sampleRate, + this.#opts.numChannels, + samples100Ms, + ); + + for await (const data of this.input) { + let frames: AudioFrame[]; + if (data === SpeechStream.FLUSH_SENTINEL) { + frames = stream.flush(); + + // Send any remaining frames + for (const frame of frames) { + const audioB64 = Buffer.from(frame.data.buffer).toString('base64'); + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: audioB64, + commit: false, + sample_rate: this.#opts.sampleRate, + }), + ); + } + + // Send commit message if using manual commit strategy + if (this.#opts.commitStrategy === 'manual') { + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: '', + commit: true, + sample_rate: this.#opts.sampleRate, + }), + ); + } + } else { + if ( + data.sampleRate !== this.#opts.sampleRate || + data.channels !== this.#opts.numChannels + ) { + throw new Error( + `sample rate or channel count of frame does not match (expected 
${this.#opts.sampleRate}/${this.#opts.numChannels}, got ${data.sampleRate}/${data.channels})`, + ); + } + frames = stream.write(data.data.buffer); + + for (const frame of frames) { + const audioB64 = Buffer.from(frame.data.buffer).toString('base64'); + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: audioB64, + commit: false, + sample_rate: this.#opts.sampleRate, + }), + ); + } + } + } + + closing = true; + }; + + const wsMonitor = new Promise((_, reject) => + ws.once('close', (code, reason) => { + if (!closing) { + this.#logger.error(`WebSocket closed with code ${code}: ${reason}`); + reject(new Error('WebSocket closed')); + } + }), + ); + + const listenTask = async () => { + await new Promise((resolve) => { + ws.on('message', (msg) => { + const json = JSON.parse(msg.toString()); + this.#processStreamEvent(json); + + if (this.closed || closing) { + resolve(); + } + }); + }); + }; + + await Promise.all([sendTask(), listenTask(), wsMonitor]); + closing = true; + ws.close(); + clearInterval(keepalive); + } + + #processStreamEvent(data: any) { + const messageType = data.message_type; + + if (messageType === 'partial_transcript') { + const text = data.text || ''; + + // Ignore stale partial transcripts that match the last committed text + if (text && text === this.#lastCommittedText) { + return; + } + + if (text) { + // Send START_OF_SPEECH if this is the first transcript in a new segment + if (!this.#speaking) { + this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH }); + this.#speaking = true; + this.#lastCommittedText = ''; + } + + this.queue.put({ + type: stt.SpeechEventType.INTERIM_TRANSCRIPT, + alternatives: [ + { + text, + language: this.#opts.languageCode || 'en', + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }); + } + } else if ( + messageType === 'committed_transcript' || + messageType === 'committed_transcript_with_timestamps' + ) { + const text = data.text || ''; + + if (text) { + // Send START_OF_SPEECH if we get a FINAL without any INTERIM first + if (!this.#speaking) { + this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH }); + } + + this.queue.put({ + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text, + language: this.#opts.languageCode || 'en', + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }); + + // Send end of speech event + this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH }); + this.#speaking = false; + this.#lastCommittedText = text; + } else { + // Empty commit - just reset state + this.#speaking = false; + this.#lastCommittedText = ''; + } + } else if (messageType === 'session_started') { + const sessionId = data.session_id || 'unknown'; + this.#logger.info(`ElevenLabs session started with ID: ${sessionId}`); + } else { + this.#logger.warn(`Unknown message type: ${messageType}`); + } + } +} From 43b0dd1f2d09477f32ecacf924e7f0703ff3b820 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 13 Nov 2025 11:57:37 +0100 Subject: [PATCH 2/7] improve logging and error handling also merge README --- plugins/elevenlabs/README.md | 207 +++++++++++++++++++++++++++++- plugins/elevenlabs/README.stt.md | 213 ------------------------------- plugins/elevenlabs/src/stt.ts | 55 ++++---- 3 files changed, 237 insertions(+), 238 deletions(-) delete mode 100644 plugins/elevenlabs/README.stt.md diff --git a/plugins/elevenlabs/README.md b/plugins/elevenlabs/README.md index 77764172b..7fd5e59f0 100644 --- a/plugins/elevenlabs/README.md +++ b/plugins/elevenlabs/README.md @@ -3,15 +3,218 @@ 
SPDX-FileCopyrightText: 2024 LiveKit, Inc. SPDX-License-Identifier: Apache-2.0 --> -# ElevenLabs plugin for LiveKit Agents +# ElevenLabs Plugin for LiveKit Agents The Agents Framework is designed for building realtime, programmable participants that run on servers. Use it to create conversational, multi-modal voice agents that can see, hear, and understand. -This package contains the ElevenLabs plugin, which allows for voice synthesis. +This package contains the ElevenLabs plugin, which provides: +- **Text-to-Speech (TTS)**: High-quality voice synthesis with multiple voices and models +- **Speech-to-Text (STT)**: Real-time and batch transcription with Scribe API + Refer to the [documentation](https://docs.livekit.io/agents/overview/) for information on how to use it, or browse the [API reference](https://docs.livekit.io/agents-js/modules/plugins_agents_plugin_elevenlabs.html). See the [repository](https://github.com/livekit/agents-js) for more information about the framework as a whole. + +## Installation + +```bash +pnpm add @livekit/agents-plugin-elevenlabs +``` + +Set your ElevenLabs API key: +```bash +export ELEVEN_API_KEY=your_api_key_here +``` + +--- + +## Text-to-Speech (TTS) + +For TTS documentation, refer to the [API reference](https://docs.livekit.io/agents-js/modules/plugins_agents_plugin_elevenlabs.html). + +### Quick Example + +```typescript +import { TTS } from '@livekit/agents-plugin-elevenlabs'; + +const tts = new TTS(); +// Use tts for voice synthesis +``` + +--- + +## Speech-to-Text (STT) + +### Features + +- **Multiple Model Support**: Choose between Scribe v1, v2, and v2 realtime +- **Streaming & Non-Streaming**: Support for both batch and real-time transcription +- **Multi-Language**: Supports 35+ languages with automatic language detection +- **Audio Event Tagging**: Optional tagging of non-speech audio events (laughter, footsteps, etc.) 
+- **VAD Configuration**: Customizable voice activity detection for streaming mode + +### Supported Models + +#### Scribe v1 (`scribe_v1`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Batch transcription of pre-recorded audio +- **Features**: Audio event tagging, language detection + +#### Scribe v2 (`scribe_v2`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Improved accuracy for batch transcription +- **Features**: Enhanced model, language detection + +#### Scribe v2 Realtime (`scribe_v2_realtime`) +- **Type**: Streaming (default) +- **Method**: WebSocket +- **Use Case**: Real-time conversation transcription +- **Features**: Interim results, VAD-based segmentation, manual commit support + +### Quick Start + +#### Non-Streaming (Scribe v1) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; + +const stt = new STT({ + apiKey: process.env.ELEVEN_API_KEY, // or set ELEVEN_API_KEY env var + model: 'scribe_v1', + languageCode: 'en', + tagAudioEvents: true, +}); +``` + +#### Streaming (Scribe v2 Realtime) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; +import { SpeechEventType } from '@livekit/agents'; + +const stt = new STT({ + model: 'scribe_v2_realtime', // default + sampleRate: 16000, + languageCode: 'en', + commitStrategy: 'vad', // auto-commit on speech end + vadSilenceThresholdSecs: 1.0, +}); +``` + +### Configuration Options + +#### Common Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `apiKey` | `string` | `process.env.ELEVEN_API_KEY` | ElevenLabs API key | +| `baseURL` | `string` | `https://api.elevenlabs.io/v1` | API base URL | +| `model` | `STTModels` | `'scribe_v2_realtime'` | Model to use | +| `languageCode` | `string` | `undefined` | Language code (auto-detected if not set) | + +#### Non-Streaming Options (v1, v2) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `tagAudioEvents` | `boolean` | `true` | Tag non-speech events like (laughter) | + +#### Streaming Options (v2_realtime) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `sampleRate` | `number` | `16000` | Audio sample rate in Hz (16000, 22050, or 44100) | +| `numChannels` | `number` | `1` | Number of audio channels | +| `commitStrategy` | `'vad' \| 'manual'` | `'vad'` | How to commit transcripts | +| `vadSilenceThresholdSecs` | `number` | `undefined` | VAD silence threshold (0.3-3.0 seconds) | +| `vadThreshold` | `number` | `undefined` | VAD threshold (0.1-0.9) | +| `minSpeechDurationMs` | `number` | `undefined` | Minimum speech duration (50-2000 ms) | +| `minSilenceDurationMs` | `number` | `undefined` | Minimum silence duration (50-2000 ms) | + +### Supported Languages + +The STT plugin supports 35+ languages including: + +English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Dutch (`nl`), Swedish (`sv`), Finnish (`fi`), Danish (`da`), Norwegian (`no`), Czech (`cs`), Romanian (`ro`), Slovak (`sk`), Ukrainian (`uk`), Greek (`el`), Turkish (`tr`), Russian (`ru`), Bulgarian (`bg`), Croatian (`hr`), Serbian (`sr`), Hungarian (`hu`), Lithuanian (`lt`), Latvian (`lv`), Estonian (`et`), Japanese (`ja`), Chinese (`zh`), Korean (`ko`), Hindi (`hi`), Arabic (`ar`), Persian (`fa`), Hebrew (`he`), Indonesian (`id`), Malay (`ms`), Thai (`th`), Vietnamese (`vi`), Tamil (`ta`), Urdu (`ur`) + +### Advanced Usage + +#### Custom VAD Parameters + 
+Fine-tune voice activity detection for your use case: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + commitStrategy: 'vad', + + // Longer silence before committing (good for thoughtful speakers) + vadSilenceThresholdSecs: 2.0, + + // Higher threshold = more strict about what's considered speech + vadThreshold: 0.7, + + // Ignore very short speech bursts (reduce false positives) + minSpeechDurationMs: 200, + + // Require longer silence to end speech (reduce fragmentation) + minSilenceDurationMs: 500, +}); +``` + +#### Multi-Language Support + +Let ElevenLabs auto-detect the language: + +```typescript +const stt = new STT({ + model: 'scribe_v1', + // Don't set languageCode - will auto-detect +}); + +const event = await stt.recognize(audioBuffer); +console.log('Detected language:', event.alternatives[0].language); +console.log('Text:', event.alternatives[0].text); +``` + +Or specify a language: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + languageCode: 'es', // Spanish +}); +``` + +### Model Comparison + +| Feature | Scribe v1 | Scribe v2 | Scribe v2 Realtime | +|---------|-----------|-----------|-------------------| +| **Type** | Non-streaming | Non-streaming | Streaming | +| **Latency** | High (batch) | High (batch) | Low (real-time) | +| **Interim Results** | ❌ | ❌ | ✅ | +| **Audio Event Tagging** | ✅ | ❌ | ❌ | +| **VAD Configuration** | ❌ | ❌ | ✅ | +| **Manual Commit** | ❌ | ❌ | ✅ | +| **Best For** | Batch jobs with event detection | High-accuracy batch | Real-time conversations | + +--- + +## Resources + +- [ElevenLabs TTS Documentation](https://elevenlabs.io/docs/api-reference/text-to-speech) +- [ElevenLabs STT Documentation](https://elevenlabs.io/docs/api-reference/speech-to-text) +- [Scribe v2 Streaming Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming) +- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) +- [LiveKit Agents JS Repository](https://github.com/livekit/agents-js) + +## License + +Copyright 2025 LiveKit, Inc. + +Licensed under the Apache License, Version 2.0. diff --git a/plugins/elevenlabs/README.stt.md b/plugins/elevenlabs/README.stt.md deleted file mode 100644 index 095c138f1..000000000 --- a/plugins/elevenlabs/README.stt.md +++ /dev/null @@ -1,213 +0,0 @@ -# ElevenLabs STT Plugin for LiveKit Agents - -This plugin provides speech-to-text capabilities using ElevenLabs Scribe API for LiveKit agents. - -## Features - -- **Multiple Model Support**: Choose between Scribe v1, v2, and v2 realtime -- **Streaming & Non-Streaming**: Support for both batch and real-time transcription -- **Multi-Language**: Supports 35+ languages with automatic language detection -- **Audio Event Tagging**: Optional tagging of non-speech audio events (laughter, footsteps, etc.) 
-- **VAD Configuration**: Customizable voice activity detection for streaming mode - -## Installation - -```bash -pnpm add @livekit/agents-plugin-elevenlabs -``` - -## Supported Models - -### Scribe v1 (`scribe_v1`) -- **Type**: Non-streaming -- **Method**: HTTP POST -- **Use Case**: Batch transcription of pre-recorded audio -- **Features**: Audio event tagging, language detection - -### Scribe v2 (`scribe_v2`) -- **Type**: Non-streaming -- **Method**: HTTP POST -- **Use Case**: Improved accuracy for batch transcription -- **Features**: Enhanced model, language detection - -### Scribe v2 Realtime (`scribe_v2_realtime`) -- **Type**: Streaming -- **Method**: WebSocket -- **Use Case**: Real-time conversation transcription -- **Features**: Interim results, VAD-based segmentation, manual commit support - -## Quick Start - -### Non-Streaming (Scribe v1) - -```typescript -import { STT } from '@livekit/agents-plugin-elevenlabs'; - -const stt = new STT({ - apiKey: process.env.ELEVEN_API_KEY, // or set ELEVEN_API_KEY env var - model: 'scribe_v1', - languageCode: 'en', - tagAudioEvents: true, -}); -``` - -### Streaming (Scribe v2 Realtime) - -```typescript -import { STT } from '@livekit/agents-plugin-elevenlabs'; -import { SpeechEventType } from '@livekit/agents'; - -const stt = new STT({ - model: 'scribe_v2_realtime', // default - sampleRate: 16000, - languageCode: 'en', - commitStrategy: 'vad', // auto-commit on speech end - vadSilenceThresholdSecs: 1.0, -}); -``` - -## Configuration Options - -### Common Options - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `apiKey` | `string` | `process.env.ELEVEN_API_KEY` | ElevenLabs API key | -| `baseURL` | `string` | `https://api.elevenlabs.io/v1` | API base URL | -| `model` | `STTModels` | `'scribe_v1'` | Model to use | -| `languageCode` | `string` | `undefined` | Language code (auto-detected if not set) | - -### Non-Streaming Options (v1, v2) - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `tagAudioEvents` | `boolean` | `true` | Tag non-speech events like (laughter) | - -### Streaming Options (v2_realtime) - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `sampleRate` | `number` | `16000` | Audio sample rate in Hz (16000, 22050, or 44100) | -| `numChannels` | `number` | `1` | Number of audio channels | -| `commitStrategy` | `'vad' \| 'manual'` | `'vad'` | How to commit transcripts | -| `vadSilenceThresholdSecs` | `number` | `undefined` | VAD silence threshold (0.3-3.0 seconds) | -| `vadThreshold` | `number` | `undefined` | VAD threshold (0.1-0.9) | -| `minSpeechDurationMs` | `number` | `undefined` | Minimum speech duration (50-2000 ms) | -| `minSilenceDurationMs` | `number` | `undefined` | Minimum silence duration (50-2000 ms) | - -## Supported Languages - -The plugin supports 35+ languages including: - -- **English** (`en`) -- **Spanish** (`es`) -- **French** (`fr`) -- **German** (`de`) -- **Italian** (`it`) -- **Portuguese** (`pt`) -- **Polish** (`pl`) -- **Dutch** (`nl`) -- **Swedish** (`sv`) -- **Finnish** (`fi`) -- **Danish** (`da`) -- **Norwegian** (`no`) -- **Czech** (`cs`) -- **Romanian** (`ro`) -- **Slovak** (`sk`) -- **Ukrainian** (`uk`) -- **Greek** (`el`) -- **Turkish** (`tr`) -- **Russian** (`ru`) -- **Bulgarian** (`bg`) -- **Croatian** (`hr`) -- **Serbian** (`sr`) -- **Hungarian** (`hu`) -- **Lithuanian** (`lt`) -- **Latvian** (`lv`) -- **Estonian** (`et`) -- **Japanese** (`ja`) -- **Chinese** (`zh`) -- **Korean** 
(`ko`) -- **Hindi** (`hi`) -- **Arabic** (`ar`) -- **Persian** (`fa`) -- **Hebrew** (`he`) -- **Indonesian** (`id`) -- **Malay** (`ms`) -- **Thai** (`th`) -- **Vietnamese** (`vi`) -- **Tamil** (`ta`) -- **Urdu** (`ur`) - -## Advanced Usage - -### Custom VAD Parameters - -Fine-tune voice activity detection for your use case: - -```typescript -const stt = new STT({ - model: 'scribe_v2_realtime', - commitStrategy: 'vad', - - // Longer silence before committing (good for thoughtful speakers) - vadSilenceThresholdSecs: 2.0, - - // Higher threshold = more strict about what's considered speech - vadThreshold: 0.7, - - // Ignore very short speech bursts (reduce false positives) - minSpeechDurationMs: 200, - - // Require longer silence to end speech (reduce fragmentation) - minSilenceDurationMs: 500, -}); -``` - -### Multi-Language Support - -Let ElevenLabs auto-detect the language: - -```typescript -const stt = new STT({ - model: 'scribe_v1', - // Don't set languageCode - will auto-detect -}); - -const event = await stt.recognize(audioBuffer); -console.log('Detected language:', event.alternatives[0].language); -console.log('Text:', event.alternatives[0].text); -``` - -Or specify a language: - -```typescript -const stt = new STT({ - model: 'scribe_v2_realtime', - languageCode: 'es', // Spanish -}); -``` - -## Model Comparison - -| Feature | Scribe v1 | Scribe v2 | Scribe v2 Realtime | -|---------|-----------|-----------|-------------------| -| **Type** | Non-streaming | Non-streaming | Streaming | -| **Latency** | High (batch) | High (batch) | Low (real-time) | -| **Interim Results** | ❌ | ❌ | ✅ | -| **Audio Event Tagging** | ✅ | ❌ | ❌ | -| **VAD Configuration** | ❌ | ❌ | ✅ | -| **Manual Commit** | ❌ | ❌ | ✅ | -| **Best For** | Batch jobs with event detection | High-accuracy batch | Real-time conversations | - -## Resources - -- [ElevenLabs STT Documentation](https://elevenlabs.io/docs/api-reference/speech-to-text) -- [Scribe v2 Streaming Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming) -- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) - -## License - -Copyright 2025 LiveKit, Inc. - -Licensed under the Apache License, Version 2.0. 
diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts index 6aedae0aa..5f52fc8f7 100644 --- a/plugins/elevenlabs/src/stt.ts +++ b/plugins/elevenlabs/src/stt.ts @@ -269,6 +269,9 @@ export class SpeechStream extends stt.SpeechStream { ws.on('close', (code) => reject(`WebSocket returned ${code}`)); }); + // on success reset retries + retries = 0; + await this.#runWS(ws); } catch (e) { if (retries >= maxRetry) { @@ -279,7 +282,7 @@ export class SpeechStream extends stt.SpeechStream { retries++; this.#logger.warn( - `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`, + `STT: failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`, ); await new Promise((resolve) => setTimeout(resolve, delay * 1000)); } @@ -291,17 +294,6 @@ export class SpeechStream extends stt.SpeechStream { async #runWS(ws: WebSocket) { let closing = false; - const keepalive = setInterval(() => { - try { - if (ws.readyState === WebSocket.OPEN) { - ws.send(JSON.stringify({ message_type: 'keepalive' })); - } - } catch { - clearInterval(keepalive); - return; - } - }, 5000); - const sendTask = async () => { const samples100Ms = Math.floor(this.#opts.sampleRate / 10); const stream = new AudioByteStream( @@ -310,6 +302,7 @@ export class SpeechStream extends stt.SpeechStream { samples100Ms, ); + let frame_count = 0; for await (const data of this.input) { let frames: AudioFrame[]; if (data === SpeechStream.FLUSH_SENTINEL) { @@ -349,6 +342,11 @@ export class SpeechStream extends stt.SpeechStream { ); } frames = stream.write(data.data.buffer); + frame_count += frames.length; + + if (frame_count % 100 == 0) { + this.#logger.debug(`STT: Sent ${frame_count} audio frames`); + } for (const frame of frames) { const audioB64 = Buffer.from(frame.data.buffer).toString('base64'); @@ -364,26 +362,36 @@ export class SpeechStream extends stt.SpeechStream { } } + this.#logger.info(`STT: Send task complete, sent ${frame_count} total frames`); closing = true; }; - const wsMonitor = new Promise((_, reject) => + const wsMonitor = new Promise((resolve, reject) => ws.once('close', (code, reason) => { + console.log('code', code, reason); if (!closing) { - this.#logger.error(`WebSocket closed with code ${code}: ${reason}`); + this.#logger.error(`STT: WebSocket closed unexpectedly with code ${code}: ${reason}`); reject(new Error('WebSocket closed')); + } else { + this.#logger.error(`STT: WebSocket closed normally ${code}: ${reason}`); + resolve(); } }), ); const listenTask = async () => { - await new Promise((resolve) => { + await new Promise((resolve, reject) => { ws.on('message', (msg) => { - const json = JSON.parse(msg.toString()); - this.#processStreamEvent(json); - - if (this.closed || closing) { - resolve(); + try { + const json = JSON.parse(msg.toString()); + this.#processStreamEvent(json); + + if (this.closed || closing) { + resolve(); + } + } catch (err) { + this.#logger.error(`STT: Error processing message: ${msg}`); + reject(err); } }); }); @@ -392,7 +400,6 @@ export class SpeechStream extends stt.SpeechStream { await Promise.all([sendTask(), listenTask(), wsMonitor]); closing = true; ws.close(); - clearInterval(keepalive); } #processStreamEvent(data: any) { @@ -463,9 +470,11 @@ export class SpeechStream extends stt.SpeechStream { } } else if (messageType === 'session_started') { const sessionId = data.session_id || 'unknown'; - this.#logger.info(`ElevenLabs session started with ID: ${sessionId}`); + this.#logger.info(`STT: ElevenLabs session 
started with ID: ${sessionId}`);
+    } else if (messageType === 'input_error') {
+      this.#logger.error(`STT: Input Error received: ${data.error}. We ignore this for now.`);
     } else {
-      this.#logger.warn(`Unknown message type: ${messageType}`);
+      this.#logger.warn(`STT: Unknown message type: ${messageType}`);
     }
   }
 }

From c96ea4e664f633503e9b0e44a1f6030bbf142261 Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 13 Nov 2025 12:12:14 +0100
Subject: [PATCH 3/7] update support matrix in root README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 269868c24..3a07fdc1c 100644
--- a/README.md
+++ b/README.md
@@ -62,11 +62,11 @@ pnpm install @livekit/agents
 Currently, only the following plugins are supported:
 
 | Plugin | Features |
-| ---------------------------------------------------------------------------------------------------- | ------------- |
+| ---------------------------------------------------------------------------------------------------- |---------------|
 | [@livekit/agents-plugin-openai](https://www.npmjs.com/package/@livekit/agents-plugin-openai) | LLM, TTS, STT |
 | [@livekit/agents-plugin-google](https://www.npmjs.com/package/@livekit/agents-plugin-google) | LLM, TTS |
 | [@livekit/agents-plugin-deepgram](https://www.npmjs.com/package/@livekit/agents-plugin-deepgram) | STT, TTS |
-| [@livekit/agents-plugin-elevenlabs](https://www.npmjs.com/package/@livekit/agents-plugin-elevenlabs) | TTS |
+| [@livekit/agents-plugin-elevenlabs](https://www.npmjs.com/package/@livekit/agents-plugin-elevenlabs) | TTS, STT |
 | [@livekit/agents-plugin-cartesia](https://www.npmjs.com/package/@livekit/agents-plugin-cartesia) | TTS |
 | [@livekit/agents-plugin-neuphonic](https://www.npmjs.com/package/@livekit/agents-plugin-neuphonic) | TTS |
 | [@livekit/agents-plugin-resemble](https://www.npmjs.com/package/@livekit/agents-plugin-resemble) | TTS |

From f47dd179ad9d7aa934460f0a7b0051282f5675ca Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 13 Nov 2025 12:12:29 +0100
Subject: [PATCH 4/7] ensure cleaning up event listeners

---
 plugins/elevenlabs/src/stt.ts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts
index 5f52fc8f7..1f59fe184 100644
--- a/plugins/elevenlabs/src/stt.ts
+++ b/plugins/elevenlabs/src/stt.ts
@@ -278,6 +278,8 @@ export class SpeechStream extends stt.SpeechStream {
         throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);
       }
 
+      ws.removeAllListeners();
+
       const delay = Math.min(retries * 5, 10);
       retries++;
 

From 05796317077c07498f8fdeecb3fd8e2041b0acf7 Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 13 Nov 2025 16:32:20 +0100
Subject: [PATCH 5/7] added ws.ping

Unfortunately it doesn't help the disconnect error.
[16:30:14.475] ERROR (390804): STT: WebSocket closed unexpectedly with code 1000:
[16:30:14.475] WARN (390804): STT: failed to connect to ElevenLabs, retrying in 0 seconds: Error: WebSocket closed (1/32)
---
 plugins/elevenlabs/src/stt.ts | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts
index 1f59fe184..cbb4e7a72 100644
--- a/plugins/elevenlabs/src/stt.ts
+++ b/plugins/elevenlabs/src/stt.ts
@@ -296,6 +296,17 @@ export class SpeechStream extends stt.SpeechStream {
   async #runWS(ws: WebSocket) {
     let closing = false;
 
+    const keepalive = setInterval(() => {
+      try {
+        if (ws.readyState === WebSocket.OPEN) {
+          ws.ping();
+        }
+      } catch {
+        clearInterval(keepalive);
+        return;
+      }
+    }, 5000);
+
     const sendTask = async () => {
       const samples100Ms = Math.floor(this.#opts.sampleRate / 10);
       const stream = new AudioByteStream(
@@ -402,6 +413,7 @@ export class SpeechStream extends stt.SpeechStream {
     await Promise.all([sendTask(), listenTask(), wsMonitor]);
     closing = true;
     ws.close();
+    clearInterval(keepalive);
   }
 
   #processStreamEvent(data: any) {

From ac88aeebedc5a77941514292d40ccb2844e492a2 Mon Sep 17 00:00:00 2001
From: simon
Date: Fri, 14 Nov 2025 11:34:35 +0100
Subject: [PATCH 6/7] upgrade ws to 8.18.3

---
 plugins/elevenlabs/package.json | 2 +-
 plugins/elevenlabs/src/stt.ts | 10 ++++-----
 pnpm-lock.yaml | 40 ++-------------------------------
 3 files changed, 8 insertions(+), 44 deletions(-)

diff --git a/plugins/elevenlabs/package.json b/plugins/elevenlabs/package.json
index 12a0e3f15..a7431974f 100644
--- a/plugins/elevenlabs/package.json
+++ b/plugins/elevenlabs/package.json
@@ -44,7 +44,7 @@
     "typescript": "^5.0.0"
   },
   "dependencies": {
-    "ws": "^8.16.0"
+    "ws": "^8.18.3"
   },
   "peerDependencies": {
     "@livekit/agents": "workspace:*",
diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts
index cbb4e7a72..d9a87fc27 100644
--- a/plugins/elevenlabs/src/stt.ts
+++ b/plugins/elevenlabs/src/stt.ts
@@ -38,7 +38,7 @@ const defaultSTTOptions: STTOptions = {
   apiKey: process.env.ELEVEN_API_KEY,
   baseURL: API_BASE_URL_V1,
   model: 'scribe_v2_realtime',
-  tagAudioEvents: true,
+  tagAudioEvents: false,
   sampleRate: 16000,
   numChannels: 1,
   commitStrategy: 'vad',
@@ -61,7 +61,7 @@ export class STT extends stt.STT {
    * @param opts.baseURL - Base URL for the API (defaults to https://api.elevenlabs.io/v1)
    * @param opts.model - Model to use: 'scribe_v1' (non-streaming), 'scribe_v2' (non-streaming), or 'scribe_v2_realtime' (streaming)
    * @param opts.languageCode - Language code for transcription (optional, auto-detected if not set)
-   * @param opts.tagAudioEvents - Whether to tag audio events like (laughter), (footsteps), etc. (defaults to true, scribe_v1 only)
+   * @param opts.tagAudioEvents - Whether to tag audio events like (laughter), (footsteps), etc.
(defaults to false) * @param opts.sampleRate - Sample rate for audio (defaults to 16000) * @param opts.numChannels - Number of audio channels (defaults to 1) * @param opts.commitStrategy - Commit strategy: 'vad' (auto) or 'manual' (defaults to 'vad', scribe_v2_realtime only) @@ -264,9 +264,9 @@ export class SpeechStream extends stt.SpeechStream { try { await new Promise((resolve, reject) => { - ws.on('open', resolve); - ws.on('error', (error) => reject(error)); - ws.on('close', (code) => reject(`WebSocket returned ${code}`)); + ws.once('open', resolve); + ws.once('error', (error) => reject(error)); + ws.once('close', (code) => reject(`WebSocket returned ${code}`)); }); // on success reset retries diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0ff5271e8..bacc5ef17 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -365,8 +365,8 @@ importers: plugins/elevenlabs: dependencies: ws: - specifier: ^8.16.0 - version: 8.17.0 + specifier: ^8.18.3 + version: 8.18.3 devDependencies: '@livekit/agents': specifier: workspace:* @@ -1368,92 +1368,78 @@ packages: resolution: {integrity: sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-arm@1.2.0': resolution: {integrity: sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-ppc64@1.2.0': resolution: {integrity: sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-s390x@1.2.0': resolution: {integrity: sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-x64@1.2.0': resolution: {integrity: sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.2.0': resolution: {integrity: sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.2.0': resolution: {integrity: sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-linux-arm64@0.34.3': resolution: {integrity: sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-linux-arm@0.34.3': resolution: {integrity: sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-linux-ppc64@0.34.3': resolution: {integrity: sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-linux-s390x@0.34.3': resolution: {integrity: sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-linux-x64@0.34.3': resolution: {integrity: 
sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-linuxmusl-arm64@0.34.3': resolution: {integrity: sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-linuxmusl-x64@0.34.3': resolution: {integrity: sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-wasm32@0.34.3': resolution: {integrity: sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==} @@ -1578,14 +1564,12 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [glibc] '@livekit/rtc-node-linux-x64-gnu@0.13.13': resolution: {integrity: sha512-B/SgbeBRobpA5LqmDEoBJHpRXePpoF4RO4F0zJf9BdkDhOR0j77p6hD0ZiOuPTRoBzUqukpsTszp+lZnHoNmiA==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [glibc] '@livekit/rtc-node-win32-x64-msvc@0.13.13': resolution: {integrity: sha512-ygVYV4eHczs3QdaW/p0ADhhm7InUDhFaCYk8OzzIn056ZibZPXzvPizCThZqs8VsDniA01MraZF3qhZZb8IyRg==} @@ -1726,121 +1710,101 @@ packages: resolution: {integrity: sha512-3reX2fUHqN7sffBNqmEyMQVj/CKhIHZd4y631duy0hZqI8Qoqf6lTtmAKvJFYa6bhU95B1D0WgzHkmTg33In0A==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-gnueabihf@4.40.0': resolution: {integrity: sha512-y/qUMOpJxBMy8xCXD++jeu8t7kzjlOCkoxxajL58G62PJGBZVl/Gwpm7JK9+YvlB701rcQTzjUZ1JgUoPTnoQA==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.17.2': resolution: {integrity: sha512-uSqpsp91mheRgw96xtyAGP9FW5ChctTFEoXP0r5FAzj/3ZRv3Uxjtc7taRQSaQM/q85KEKjKsZuiZM3GyUivRg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm-musleabihf@4.40.0': resolution: {integrity: sha512-GoCsPibtVdJFPv/BOIvBKO/XmwZLwaNWdyD8TKlXuqp0veo2sHE+A/vpMQ5iSArRUz/uaoj4h5S6Pn0+PdhRjg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.17.2': resolution: {integrity: sha512-EMMPHkiCRtE8Wdk3Qhtciq6BndLtstqZIroHiiGzB3C5LDJmIZcSzVtLRbwuXuUft1Cnv+9fxuDtDxz3k3EW2A==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-gnu@4.40.0': resolution: {integrity: sha512-L5ZLphTjjAD9leJzSLI7rr8fNqJMlGDKlazW2tX4IUF9P7R5TMQPElpH82Q7eNIDQnQlAyiNVfRPfP2vM5Avvg==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.17.2': resolution: {integrity: sha512-NMPylUUZ1i0z/xJUIx6VUhISZDRT+uTWpBcjdv0/zkp7b/bQDF+NfnfdzuTiB1G6HTodgoFa93hp0O1xl+/UbA==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-musl@4.40.0': resolution: {integrity: sha512-ATZvCRGCDtv1Y4gpDIXsS+wfFeFuLwVxyUBSLawjgXK2tRE6fnsQEkE4csQQYWlBlsFztRzCnBvWVfcae/1qxQ==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-loongarch64-gnu@4.40.0': resolution: {integrity: sha512-wG9e2XtIhd++QugU5MD9i7OnpaVb08ji3P1y/hNbxrQ3sYEelKJOq1UJ5dXczeo6Hj2rfDEL5GdtkMSVLa/AOg==} cpu: [loong64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.17.2': resolution: {integrity: sha512-T19My13y8uYXPw/L/k0JYaX1fJKFT/PWdXiHr8mTbXWxjVF1t+8Xl31DgBBvEKclw+1b00Chg0hxE2O7bTG7GQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.40.0': resolution: {integrity: sha512-vgXfWmj0f3jAUvC7TZSU/m/cOE558ILWDzS7jBhiCAFpY2WEBn5jqgbqvmzlMjtp8KlLcBlXVD2mkTSEQE6Ixw==} cpu: [ppc64] os: [linux] - 
libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.17.2': resolution: {integrity: sha512-BOaNfthf3X3fOWAB+IJ9kxTgPmMqPPH5f5k2DcCsRrBIbWnaJCgX2ll77dV1TdSy9SaXTR5iDXRL8n7AnoP5cg==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.40.0': resolution: {integrity: sha512-uJkYTugqtPZBS3Z136arevt/FsKTF/J9dEMTX/cwR7lsAW4bShzI2R0pJVw+hcBTWF4dxVckYh72Hk3/hWNKvA==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.40.0': resolution: {integrity: sha512-rKmSj6EXQRnhSkE22+WvrqOqRtk733x3p5sWpZilhmjnkHkpeCgWsFFo0dGnUGeA+OZjRl3+VYq+HyCOEuwcxQ==} cpu: [riscv64] os: [linux] - libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.17.2': resolution: {integrity: sha512-W0UP/x7bnn3xN2eYMql2T/+wpASLE5SjObXILTMPUBDB/Fg/FxC+gX4nvCfPBCbNhz51C+HcqQp2qQ4u25ok6g==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-s390x-gnu@4.40.0': resolution: {integrity: sha512-SpnYlAfKPOoVsQqmTFJ0usx0z84bzGOS9anAC0AZ3rdSo3snecihbhFTlJZ8XMwzqAcodjFU4+/SM311dqE5Sw==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.17.2': resolution: {integrity: sha512-Hy7pLwByUOuyaFC6mAr7m+oMC+V7qyifzs/nW2OJfC8H4hbCzOX07Ov0VFk/zP3kBsELWNFi7rJtgbKYsav9QQ==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.40.0': resolution: {integrity: sha512-RcDGMtqF9EFN8i2RYN2W+64CdHruJ5rPqrlYw+cgM3uOVPSsnAQps7cpjXe9be/yDp8UC7VLoCoKC8J3Kn2FkQ==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-musl@4.17.2': resolution: {integrity: sha512-h1+yTWeYbRdAyJ/jMiVw0l6fOOm/0D1vNLui9iPuqgRGnXA0u21gAqOyB5iHjlM9MMfNOm9RHCQ7zLIzT0x11Q==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-linux-x64-musl@4.40.0': resolution: {integrity: sha512-HZvjpiUmSNx5zFgwtQAV1GaGazT2RWvqeDi0hV+AtC8unqqDSsaFjPxfsO6qPtKRRg25SisACWnJ37Yio8ttaw==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-win32-arm64-msvc@4.17.2': resolution: {integrity: sha512-tmdtXMfKAjy5+IQsVtDiCfqbynAQE/TQRpWdVataHmhMb9DCoJxp9vLcCBjEQWMiUYxO1QprH/HbY9ragCEFLA==} From 5620c708f995e3907ca5d1786498fd0829aaf0d9 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 17 Nov 2025 15:55:21 +0100 Subject: [PATCH 7/7] stt improvements --- plugins/elevenlabs/src/stt.ts | 81 ++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts index d9a87fc27..1282b915c 100644 --- a/plugins/elevenlabs/src/stt.ts +++ b/plugins/elevenlabs/src/stt.ts @@ -7,12 +7,15 @@ import { APITimeoutError, type AudioBuffer, AudioByteStream, + Future, + Task, log, mergeFrames, stt, + waitForAbort, } from '@livekit/agents'; import type { AudioFrame } from '@livekit/rtc-node'; -import { WebSocket } from 'ws'; +import { type RawData, WebSocket } from 'ws'; import type { STTAudioFormat, STTCommitStrategy, STTLanguages, STTModels } from './models.js'; const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1'; @@ -263,11 +266,12 @@ export class SpeechStream extends stt.SpeechStream { }); try { - await new Promise((resolve, reject) => { - ws.once('open', resolve); - ws.once('error', (error) => reject(error)); - ws.once('close', (code) => reject(`WebSocket returned ${code}`)); - }); + const connFut = new Future(); + ws.once('open', () => connFut.resolve()); + ws.once('error', (error) => connFut.reject(error)); + ws.once('close', (code) => connFut.reject(new Error(`WebSocket returned ${code}`))); + + await connFut.await; // on success reset retries retries = 0; @@ -295,6 +299,7 @@ export class SpeechStream extends stt.SpeechStream { async 
#runWS(ws: WebSocket) {
     let closing = false;
+    const abortController = new AbortController();
 
     const keepalive = setInterval(() => {
       try {
@@ -307,7 +312,7 @@
       }
     }, 5000);
 
-    const sendTask = async () => {
+    const sendTask = Task.from(async (controller) => {
       const samples100Ms = Math.floor(this.#opts.sampleRate / 10);
       const stream = new AudioByteStream(
         this.#opts.sampleRate,
@@ -316,7 +321,16 @@
       );
 
       let frame_count = 0;
-      for await (const data of this.input) {
+      while (!controller.signal.aborted) {
+        const result = await Promise.race([this.input.next(), waitForAbort(controller.signal)]);
+
+        if (result === undefined) break; // aborted
+        if (result.done) {
+          controller.abort();
+          break;
+        }
+
+        const data = result.value;
         let frames: AudioFrame[];
         if (data === SpeechStream.FLUSH_SENTINEL) {
           frames = stream.flush();
@@ -377,23 +391,26 @@
 
       this.#logger.info(`STT: Send task complete, sent ${frame_count} total frames`);
       closing = true;
-    };
-
-    const wsMonitor = new Promise((resolve, reject) =>
-      ws.once('close', (code, reason) => {
-        if (!closing) {
-          this.#logger.error(`STT: WebSocket closed unexpectedly with code ${code}: ${reason}`);
-          reject(new Error('WebSocket closed'));
-        } else {
-          this.#logger.error(`STT: WebSocket closed normally ${code}: ${reason}`);
-          resolve();
-        }
-      }),
-    );
+    }, abortController);
+
+    const wsMonitor = Task.from(async (controller) => {
+      const connectionClosed = new Promise((resolve, reject) =>
+        ws.once('close', (code, reason) => {
+          if (!closing) {
+            this.#logger.error(`STT: WebSocket closed unexpectedly with code ${code}: ${reason}`);
+            reject(new Error('WebSocket closed'));
+          } else {
+            this.#logger.info(`STT: WebSocket closed normally ${code}: ${reason}`);
+            resolve();
+          }
+        }),
+      );
 
-    const listenTask = async () => {
-      await new Promise((resolve, reject) => {
+      await Promise.race([connectionClosed, waitForAbort(controller.signal)]);
+    }, abortController);
+
+    const listenTask = Task.from(async (controller) => {
+      const listenMessage = new Promise((resolve, reject) => {
         ws.on('message', (msg) => {
           try {
             const json = JSON.parse(msg.toString());
@@ -408,12 +425,18 @@
           }
         });
       });
-    };
-
-    await Promise.all([sendTask(), listenTask(), wsMonitor]);
-    closing = true;
-    ws.close();
-    clearInterval(keepalive);
+
+      await Promise.race([listenMessage, waitForAbort(controller.signal)]);
+    }, abortController);
+
+    try {
+      await Promise.all([sendTask.result, listenTask.result, wsMonitor.result]);
+    } finally {
+      closing = true;
+      abortController.abort();
+      ws.close();
+      clearInterval(keepalive);
+    }
   }
 
   #processStreamEvent(data: any) {