diff --git a/.changeset/speko-plugin.md b/.changeset/speko-plugin.md new file mode 100644 index 000000000..149339ad8 --- /dev/null +++ b/.changeset/speko-plugin.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-speko": patch +--- + +Add the Speko STT, LLM, and TTS plugin. diff --git a/README.md b/README.md index 6b717b80a..0e0ece119 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ Currently, only the following plugins are supported: | [@livekit/agents-plugin-lemonslice](https://www.npmjs.com/package/@livekit/agents-plugin-lemonslice) | Avatar | | [@livekit/agents-plugin-liveavatar](https://www.npmjs.com/package/@livekit/agents-plugin-liveavatar) | Avatar | | [@livekit/agents-plugin-mistralai](https://www.npmjs.com/package/@livekit/agents-plugin-mistralai) | LLM, STT, TTS | +| [@livekit/agents-plugin-speko](https://www.npmjs.com/package/@livekit/agents-plugin-speko) | STT, LLM, TTS | | [@livekit/agents-plugin-xai](https://www.npmjs.com/package/@livekit/agents-plugin-xai) | LLM, TTS | | [@livekit/agents-plugin-phonic](https://www.npmjs.com/package/@livekit/agents-plugin-phonic) | Realtime | | [@livekit/agents-plugin-fishaudio](https://www.npmjs.com/package/@livekit/agents-plugin-fishaudio) | TTS | diff --git a/plugins/speko/README.md b/plugins/speko/README.md new file mode 100644 index 000000000..890aa99bd --- /dev/null +++ b/plugins/speko/README.md @@ -0,0 +1,101 @@ + + +# Speko plugin for LiveKit Agents + +The Agents Framework is designed for building realtime, programmable +participants that run on servers. Use it to create conversational, multi-modal +voice agents that can see, hear, and understand. + +This package contains the Speko plugin for LiveKit Agents. Speko provides a +single STT, LLM, and TTS router for voice agents, selecting providers and +handling failover server-side so your LiveKit worker does not need separate +provider credentials. + +## Installation + +```sh +pnpm add @livekit/agents @livekit/agents-plugin-speko \ + @livekit/agents-plugin-silero @livekit/rtc-node +``` + +Set `SPEKO_API_KEY` in the environment before starting your worker. +If you need a non-default Speko API host, pass `baseURL` or set +`SPEKO_BASE_URL`. + +## Usage + +```ts +import { + type JobContext, + type JobProcess, + ServerOptions, + cli, + defineAgent, + voice, +} from '@livekit/agents'; +import * as silero from '@livekit/agents-plugin-silero'; +import * as speko from '@livekit/agents-plugin-speko'; +import { fileURLToPath } from 'node:url'; + +export default defineAgent({ + prewarm: async (proc: JobProcess) => { + proc.userData.vad = await silero.VAD.load(); + }, + entry: async (ctx: JobContext) => { + const vad = ctx.proc.userData.vad as silero.VAD; + const intent = { + language: 'en-US', + optimizeFor: 'balanced', + } as const; + + const session = new voice.AgentSession({ + vad, + stt: new speko.STT({ intent }), + llm: new speko.LLM({ intent }), + tts: new speko.TTS({ intent }), + }); + + await session.start({ + agent: new voice.Agent({ + instructions: 'You are a helpful voice assistant. Be concise.', + }), + room: ctx.room, + }); + + await ctx.connect(); + + session.generateReply({ + instructions: 'Greet the user and offer your assistance.', + }); + }, +}); + +cli.runApp( + new ServerOptions({ + agent: fileURLToPath(import.meta.url), + agentName: 'speko-demo', + }), +); +``` + +`intent` is required on each raw component because Speko uses it to route every +STT, LLM, and TTS call by language, region, and optimization preference. +`voice.AgentSession` automatically wraps non-streaming STT and TTS plugins with +LiveKit stream adapters when needed. + +## Limitations + +- STT is utterance-bounded. `speko.STT` uploads one VAD-bounded WAV per + recognition call. Use `voice.AgentSession` with a VAD such as Silero, or wrap + manually with `stt.StreamAdapter` if you implement a custom STT node. +- TTS is sentence-bounded. `voice.AgentSession` wraps `speko.TTS` with a + sentence tokenizer by default, or you can wrap it manually with + `tts.StreamAdapter` if you implement a custom TTS node. +- A Speko API key is required. Pass `apiKey`, set `SPEKO_API_KEY`, or pass a + preconfigured SDK `client`. +- TTS output must be PCM or WAV. The plugin accepts `audio/pcm;rate=NNNN` and + `audio/wav`; compressed formats such as MP3 are rejected. diff --git a/plugins/speko/api-extractor.json b/plugins/speko/api-extractor.json new file mode 100644 index 000000000..1f75e0708 --- /dev/null +++ b/plugins/speko/api-extractor.json @@ -0,0 +1,20 @@ +/** + * Config file for API Extractor. For more info, please visit: https://api-extractor.com + */ +{ + "$schema": "https://developer.microsoft.com/json-schemas/api-extractor/v7/api-extractor.schema.json", + + /** + * Optionally specifies another JSON config file that this file extends from. This provides a way for + * standard settings to be shared across multiple projects. + * + * If the path starts with "./" or "../", the path is resolved relative to the folder of the file that contains + * the "extends" field. Otherwise, the first path segment is interpreted as an NPM package name, and will be + * resolved using NodeJS require(). + * + * SUPPORTED TOKENS: none + * DEFAULT VALUE: "" + */ + "extends": "../../api-extractor-shared.json", + "mainEntryPointFilePath": "./dist/index.d.ts" +} diff --git a/plugins/speko/etc/agents-plugin-speko.api.md b/plugins/speko/etc/agents-plugin-speko.api.md new file mode 100644 index 000000000..5e0ee4fbf --- /dev/null +++ b/plugins/speko/etc/agents-plugin-speko.api.md @@ -0,0 +1,128 @@ +## API Report File for "@livekit/agents-plugin-speko" + +> Do not edit this file. It is a report generated by [API Extractor](https://api-extractor.com/). + +```ts + +import { APIConnectOptions } from '@livekit/agents'; +import type { AudioBuffer as AudioBuffer_2 } from '@livekit/agents'; +import type { ChatMessage } from '@spekoai/sdk'; +import { llm } from '@livekit/agents'; +import type { OptimizeFor } from '@spekoai/sdk'; +import type { PipelineConstraints } from '@spekoai/sdk'; +import type { RoutingIntent } from '@spekoai/sdk'; +import { Speko } from '@spekoai/sdk'; +import { stt } from '@livekit/agents'; +import type { SynthesizeResult } from '@spekoai/sdk'; +import { tts } from '@livekit/agents'; + +// @public +export function chatContextToSpeko(ctx: llm.ChatContext): ChatMessage[]; + +// @public +export function decodeSynthesisResult(result: SynthesizeResult): { + pcm: Uint8Array; + sampleRate: number; + channels: number; +}; + +// @public +export function framesToWav(buffer: AudioBuffer_2): Uint8Array; + +// @public +export type Intent = RoutingIntent; + +// @public +export class LLM extends llm.LLM { + constructor(options: LLMOptions); + chat(params: { + chatCtx: llm.ChatContext; + toolCtx?: llm.ToolContext; + connOptions?: APIConnectOptions; + parallelToolCalls?: boolean; + toolChoice?: llm.ToolChoice; + extraKwargs?: Record; + }): llm.LLMStream; + label(): string; + get model(): string; + get provider(): string; +} + +// @public +export interface LLMOptions extends SpekoClientOptions { + constraints?: PipelineConstraints; + intent: Intent; + maxTokens?: number; + temperature?: number; +} + +export { OptimizeFor } + +// @public +export function parseWav(bytes: Uint8Array): { + pcm: Uint8Array; + sampleRate: number; + channels: number; +}; + +// @public +export function pcmSampleRateFromContentType(contentType: string, fallback: number): number; + +// @public +export interface SpekoClientOptions { + apiKey?: string; + baseURL?: string; + client?: Speko; + timeout?: number; +} + +// @public +export class SpekoPluginError extends Error { + constructor(message: string, code: string); + readonly code: string; +} + +// @public +export class STT extends stt.STT { + constructor(options: STTOptions); + label: string; + get model(): string; + get provider(): string; + protected _recognize(frame: AudioBuffer_2, abortSignal?: AbortSignal): Promise; + stream(_options?: { + connOptions?: APIConnectOptions; + }): stt.SpeechStream; +} + +// @public +export interface STTOptions extends SpekoClientOptions { + constraints?: PipelineConstraints; + intent: Intent; + keywords?: readonly string[]; +} + +// @public +export class TTS extends tts.TTS { + constructor(options: TTSOptions); + label: string; + get model(): string; + get provider(): string; + stream(_options?: { + connOptions?: APIConnectOptions; + }): tts.SynthesizeStream; + synthesize(text: string, connOptions?: APIConnectOptions, abortSignal?: AbortSignal): tts.ChunkedStream; +} + +// @public +export interface TTSOptions extends SpekoClientOptions { + constraints?: PipelineConstraints; + intent: Intent; + sampleRate?: number; + speed?: number; + voice?: string; +} + +// @public +export function validateIntent(intent: Intent): void; + +``` diff --git a/plugins/speko/package.json b/plugins/speko/package.json new file mode 100644 index 000000000..e07b13d17 --- /dev/null +++ b/plugins/speko/package.json @@ -0,0 +1,52 @@ +{ + "name": "@livekit/agents-plugin-speko", + "version": "1.4.4", + "description": "Speko plugin for LiveKit Node Agents", + "main": "dist/index.js", + "require": "dist/index.cjs", + "types": "dist/index.d.ts", + "exports": { + "import": { + "types": "./dist/index.d.ts", + "default": "./dist/index.js" + }, + "require": { + "types": "./dist/index.d.cts", + "default": "./dist/index.cjs" + } + }, + "author": "LiveKit", + "type": "module", + "repository": "git@github.com:livekit/agents-js.git", + "license": "Apache-2.0", + "files": [ + "dist", + "src", + "README.md" + ], + "scripts": { + "build": "tsup --onSuccess \"pnpm build:types\"", + "build:types": "tsc --declaration --emitDeclarationOnly && node ../../scripts/copyDeclarationOutput.js", + "clean": "rm -rf dist", + "clean:build": "pnpm clean && pnpm build", + "lint": "eslint -f unix \"src/**/*.{ts,js}\"", + "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript", + "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose" + }, + "devDependencies": { + "@livekit/agents": "workspace:*", + "@livekit/agents-plugin-silero": "workspace:*", + "@livekit/agents-plugins-test": "workspace:*", + "@livekit/rtc-node": "catalog:", + "@microsoft/api-extractor": "^7.35.0", + "tsup": "^8.3.5", + "typescript": "^5.0.0" + }, + "dependencies": { + "@spekoai/sdk": "^0.4.1" + }, + "peerDependencies": { + "@livekit/agents": "workspace:*", + "@livekit/rtc-node": "catalog:" + } +} diff --git a/plugins/speko/src/audio.ts b/plugins/speko/src/audio.ts new file mode 100644 index 000000000..14f034d95 --- /dev/null +++ b/plugins/speko/src/audio.ts @@ -0,0 +1,154 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { AudioBuffer } from '@livekit/agents'; +import { AudioFrame, combineAudioFrames } from '@livekit/rtc-node'; + +const WAV_HEADER_BYTES = 44; +const PCM_FORMAT = 1; +const BITS_PER_SAMPLE = 16; + +/** + * Encode a LiveKit `AudioBuffer` (single frame or array) into a standard + * PCM16 WAV byte stream suitable for uploading to the Speko `/v1/transcribe` + * endpoint (which defaults to `audio/wav`). + * + * v1 constraint: mono only. Multi-channel frames throw so that a confusing + * downstream routing failure turns into a clear error at the plugin boundary. + * + * @public + */ +export function framesToWav(buffer: AudioBuffer): Uint8Array { + const merged = combineAudioFrames(buffer); + if (merged.channels !== 1) { + throw new Error( + `speko.STT: expected mono audio (1 channel), got ${merged.channels}. ` + + `Configure your LiveKit AgentSession to pass mono audio or pre-mix ` + + `upstream of the STT.`, + ); + } + + const pcm = merged.data; + const dataByteLength = pcm.byteLength; + const totalByteLength = WAV_HEADER_BYTES + dataByteLength; + const out = new Uint8Array(totalByteLength); + const view = new DataView(out.buffer); + const byteRate = (merged.sampleRate * merged.channels * BITS_PER_SAMPLE) / 8; + const blockAlign = (merged.channels * BITS_PER_SAMPLE) / 8; + + writeAscii(out, 0, 'RIFF'); + view.setUint32(4, totalByteLength - 8, true); + writeAscii(out, 8, 'WAVE'); + writeAscii(out, 12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, PCM_FORMAT, true); + view.setUint16(22, merged.channels, true); + view.setUint32(24, merged.sampleRate, true); + view.setUint32(28, byteRate, true); + view.setUint16(32, blockAlign, true); + view.setUint16(34, BITS_PER_SAMPLE, true); + writeAscii(out, 36, 'data'); + view.setUint32(40, dataByteLength, true); + + const pcmBytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength); + out.set(pcmBytes, WAV_HEADER_BYTES); + + return out; +} + +/** + * Parse a PCM16 WAV byte stream, returning `{ pcm, sampleRate, channels }`. + * Used by the TTS path to unwrap a WAV-formatted proxy response into raw + * samples that can be fed into `AudioByteStream`. + * + * Only the minimal subset of the WAV spec we need: PCM format, 16-bit samples, + * a `fmt ` chunk and a `data` chunk in that order. Non-conforming inputs throw. + * + * @public + */ +export function parseWav(bytes: Uint8Array): { + pcm: Uint8Array; + sampleRate: number; + channels: number; +} { + if (bytes.byteLength < WAV_HEADER_BYTES) { + throw new Error(`speko.TTS: WAV response too small (${bytes.byteLength} bytes)`); + } + const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength); + if (readAscii(bytes, 0, 4) !== 'RIFF' || readAscii(bytes, 8, 4) !== 'WAVE') { + throw new Error('speko.TTS: not a RIFF/WAVE stream'); + } + if (readAscii(bytes, 12, 4) !== 'fmt ') { + throw new Error('speko.TTS: missing `fmt ` chunk'); + } + const audioFormat = view.getUint16(20, true); + if (audioFormat !== PCM_FORMAT) { + throw new Error(`speko.TTS: unsupported WAV format ${audioFormat}, expected PCM (1)`); + } + const channels = view.getUint16(22, true); + const sampleRate = view.getUint32(24, true); + const bitsPerSample = view.getUint16(34, true); + if (bitsPerSample !== BITS_PER_SAMPLE) { + throw new Error(`speko.TTS: unsupported WAV bit depth ${bitsPerSample}, expected 16`); + } + + const fmtChunkSize = view.getUint32(16, true); + let cursor = 20 + fmtChunkSize; + while (cursor + 8 <= bytes.byteLength) { + const chunkId = readAscii(bytes, cursor, 4); + const chunkSize = view.getUint32(cursor + 4, true); + const chunkStart = cursor + 8; + if (chunkId === 'data') { + const pcm = bytes.subarray(chunkStart, chunkStart + chunkSize); + return { pcm, sampleRate, channels }; + } + cursor = chunkStart + chunkSize; + } + throw new Error('speko.TTS: WAV stream missing `data` chunk'); +} + +/** + * Parse the `rate` parameter from a `audio/pcm;rate=NNNN` content type, which + * is what Cartesia returns via the Speko proxy. Falls back to the supplied + * default when the rate is missing or unparseable. + * + * @public + */ +export function pcmSampleRateFromContentType(contentType: string, fallback: number): number { + const match = contentType.match(/rate=(\d+)/i); + if (!match || match[1] === undefined) return fallback; + const rate = parseInt(match[1], 10); + return Number.isFinite(rate) && rate > 0 ? rate : fallback; +} + +function writeAscii(buf: Uint8Array, offset: number, text: string): void { + for (let i = 0; i < text.length; i++) { + buf[offset + i] = text.charCodeAt(i); + } +} + +function readAscii(buf: Uint8Array, offset: number, length: number): string { + let out = ''; + for (let i = 0; i < length; i++) { + out += String.fromCharCode(buf[offset + i] ?? 0); + } + return out; +} + +/** + * Build a canned `AudioFrame` for tests. Exported for use from spec files - + * the plugin's runtime code never calls this directly. + */ +export function createTestFrame(options: { + samples: Int16Array; + sampleRate: number; + channels?: number; +}): AudioFrame { + const channels = options.channels ?? 1; + return new AudioFrame( + options.samples, + options.sampleRate, + channels, + options.samples.length / channels, + ); +} diff --git a/plugins/speko/src/client.test.ts b/plugins/speko/src/client.test.ts new file mode 100644 index 000000000..7ca0f3162 --- /dev/null +++ b/plugins/speko/src/client.test.ts @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { Speko } from '@spekoai/sdk'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { createSpekoClient } from './client.js'; + +afterEach(() => { + vi.unstubAllEnvs(); +}); + +describe('createSpekoClient', () => { + it('uses a provided SDK client', () => { + const client = {} as Speko; + + expect(createSpekoClient({ client })).toBe(client); + }); + + it('requires an API key when no SDK client is provided', () => { + vi.stubEnv('SPEKO_API_KEY', ''); + + expect(() => createSpekoClient({})).toThrow(/Speko API key is required/); + }); +}); diff --git a/plugins/speko/src/client.ts b/plugins/speko/src/client.ts new file mode 100644 index 000000000..16c514b84 --- /dev/null +++ b/plugins/speko/src/client.ts @@ -0,0 +1,40 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { Speko } from '@spekoai/sdk'; + +/** + * Shared Speko SDK client options used by the plugin components. + * + * Mirrors the provider-plugin convention used across LiveKit Agents: + * pass `apiKey`/`baseURL` directly, or pass a preconfigured `client` + * for tests and advanced connection handling. + * + * @public + */ +export interface SpekoClientOptions { + /** Preconfigured Speko SDK client. Takes precedence over `apiKey` and `baseURL`. */ + client?: Speko; + /** Speko API key. Defaults to `SPEKO_API_KEY`. */ + apiKey?: string; + /** Speko API base URL. Defaults to the SDK default, or `SPEKO_BASE_URL` when set. */ + baseURL?: string; + /** Request timeout in milliseconds for the SDK client. */ + timeout?: number; +} + +export function createSpekoClient(options: SpekoClientOptions): Speko { + if (options.client) return options.client; + + const apiKey = options.apiKey ?? process.env.SPEKO_API_KEY; + if (!apiKey) { + throw new Error('Speko API key is required, whether as an argument or as $SPEKO_API_KEY'); + } + + const baseUrl = options.baseURL ?? process.env.SPEKO_BASE_URL; + return new Speko({ + apiKey, + ...(baseUrl !== undefined && { baseUrl }), + ...(options.timeout !== undefined && { timeout: options.timeout }), + }); +} diff --git a/plugins/speko/src/index.test.ts b/plugins/speko/src/index.test.ts new file mode 100644 index 000000000..a520ef071 --- /dev/null +++ b/plugins/speko/src/index.test.ts @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { afterAll, beforeAll, describe, expect, it, vi } from 'vitest'; + +describe('public exports', () => { + let publicExports: Record; + + beforeAll(async () => { + vi.stubGlobal('__PACKAGE_NAME__', '@livekit/agents-plugin-speko'); + vi.stubGlobal('__PACKAGE_VERSION__', '0.0.0-test'); + publicExports = (await import('./index.js')) as unknown as Record; + }); + + afterAll(() => { + vi.unstubAllGlobals(); + }); + + it('exposes LiveKit-style namespace constructors', () => { + expect(publicExports.LLM).toBeTypeOf('function'); + expect(publicExports.STT).toBeTypeOf('function'); + expect(publicExports.TTS).toBeTypeOf('function'); + }); + + it('does not expose draft Speko-prefixed component aliases', () => { + expect(publicExports.SpekoLLM).toBeUndefined(); + expect(publicExports.SpekoSTT).toBeUndefined(); + expect(publicExports.SpekoTTS).toBeUndefined(); + }); + + it('does not expose a Speko-specific component factory', () => { + expect(publicExports.createSpekoComponents).toBeUndefined(); + }); +}); diff --git a/plugins/speko/src/index.ts b/plugins/speko/src/index.ts new file mode 100644 index 000000000..db118b18d --- /dev/null +++ b/plugins/speko/src/index.ts @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { Plugin } from '@livekit/agents'; + +/** + * Speko STT, LLM, and TTS plugin for LiveKit Agents. + * + * @packageDocumentation + */ +export { framesToWav, parseWav, pcmSampleRateFromContentType } from './audio.js'; +export { type SpekoClientOptions } from './client.js'; +export type { Intent, OptimizeFor } from './intent.js'; +export { validateIntent } from './intent.js'; +export { chatContextToSpeko, LLM, SpekoPluginError, type LLMOptions } from './llm.js'; +export { STT, type STTOptions } from './stt.js'; +export { decodeSynthesisResult, TTS, type TTSOptions } from './tts.js'; + +class SpekoPlugin extends Plugin { + constructor() { + super({ + title: 'speko', + version: __PACKAGE_VERSION__, + package: __PACKAGE_NAME__, + }); + } +} + +Plugin.registerPlugin(new SpekoPlugin()); diff --git a/plugins/speko/src/intent.ts b/plugins/speko/src/intent.ts new file mode 100644 index 000000000..553ebc99a --- /dev/null +++ b/plugins/speko/src/intent.ts @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { OptimizeFor, RoutingIntent } from '@spekoai/sdk'; + +/** + * Routing hint passed to every Speko proxy call the plugin makes. Mirrors + * `@spekoai/sdk`'s `RoutingIntent` so that callers can pass a value they got + * from the SDK directly without a type detour. + * + * @public + */ +export type Intent = RoutingIntent; + +export type { OptimizeFor }; + +const OPTIMIZE_FOR: ReadonlySet = new Set(['balanced', 'accuracy', 'latency', 'cost']); + +/** + * Validate an {@link Intent} at construction time so that a broken routing + * hint throws when the plugin is created, not deep inside the first STT call. + * + * @public + */ +export function validateIntent(intent: Intent): void { + if (!intent.language || typeof intent.language !== 'string') { + throw new Error('SpekoPlugin: intent.language is required (BCP-47 tag)'); + } + if (intent.optimizeFor !== undefined && !OPTIMIZE_FOR.has(intent.optimizeFor)) { + throw new Error( + `SpekoPlugin: unknown optimizeFor "${intent.optimizeFor}". ` + + `Expected one of: ${[...OPTIMIZE_FOR].join(', ')}.`, + ); + } +} diff --git a/plugins/speko/src/llm.ts b/plugins/speko/src/llm.ts new file mode 100644 index 000000000..a7b533394 --- /dev/null +++ b/plugins/speko/src/llm.ts @@ -0,0 +1,428 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, llm, log } from '@livekit/agents'; +import type { + ChatTool, + ChatToolChoice, + PipelineConstraints, + Speko, + ChatMessage as SpekoChatMessage, +} from '@spekoai/sdk'; +import { type SpekoClientOptions, createSpekoClient } from './client.js'; +import { type Intent, validateIntent } from './intent.js'; + +/** + * Error thrown by the Speko plugin when a provider or configuration failure is + * surfaced through the LiveKit model interface. + * + * @public + */ +export class SpekoPluginError extends Error { + /** Machine-readable Speko plugin error code. */ + readonly code: string; + + constructor(message: string, code: string) { + super(message); + this.name = 'SpekoPluginError'; + this.code = code; + } +} + +/** + * Options for the Speko LLM component. + * + * @public + */ +export interface LLMOptions extends SpekoClientOptions { + /** Routing intent sent with every completion request. */ + intent: Intent; + /** Forwarded to the proxy; defaults to the upstream model's default. */ + temperature?: number; + /** Forwarded to the proxy; defaults to the upstream model's default. */ + maxTokens?: number; + /** Optional allow-list constraints. */ + constraints?: PipelineConstraints; +} + +/** + * LiveKit Agents LLM plugin that delegates completion to the Speko proxy + * (`POST /v1/complete`). The router picks the best LLM provider per intent + * and fails over automatically. + * + * Each `.chat()` call streams text deltas as the proxy emits them, and yields + * tool calls at the end when the model invokes tools. + * + * @public + */ +export class LLM extends llm.LLM { + readonly #speko: Speko; + readonly #intent: Intent; + readonly #temperature?: number; + readonly #maxTokens?: number; + readonly #constraints: PipelineConstraints | undefined; + + constructor(options: LLMOptions) { + super(); + validateIntent(options.intent); + this.#speko = createSpekoClient(options); + this.#intent = options.intent; + this.#temperature = options.temperature; + this.#maxTokens = options.maxTokens; + this.#constraints = options.constraints; + } + + /** Human-readable model label used by LiveKit metrics and logs. */ + override label(): string { + return 'speko.LLM'; + } + + /** Provider identifier reported to LiveKit metrics. */ + override get provider(): string { + return 'speko'; + } + + /** Model identifier reported to LiveKit metrics. */ + override get model(): string { + return 'speko-router'; + } + + /** + * Create a LiveKit LLM stream for the provided chat context and runtime + * tools. + */ + override chat(params: { + chatCtx: llm.ChatContext; + toolCtx?: llm.ToolContext; + connOptions?: APIConnectOptions; + parallelToolCalls?: boolean; + toolChoice?: llm.ToolChoice; + extraKwargs?: Record; + }): llm.LLMStream { + return new LLMStream(this, { + chatCtx: params.chatCtx, + toolCtx: params.toolCtx, + toolChoice: params.toolChoice, + parallelToolCalls: params.parallelToolCalls, + connOptions: params.connOptions ?? DEFAULT_API_CONNECT_OPTIONS, + speko: this.#speko, + intent: this.#intent, + temperature: this.#temperature, + maxTokens: this.#maxTokens, + constraints: this.#constraints, + }); + } +} + +interface LLMStreamArgs { + chatCtx: llm.ChatContext; + toolCtx?: llm.ToolContext; + toolChoice?: llm.ToolChoice; + parallelToolCalls?: boolean; + connOptions: APIConnectOptions; + speko: Speko; + intent: Intent; + temperature?: number; + maxTokens?: number; + constraints?: PipelineConstraints; +} + +class LLMStream extends llm.LLMStream { + readonly #speko: Speko; + readonly #intent: Intent; + readonly #temperature?: number; + readonly #maxTokens?: number; + readonly #constraints: PipelineConstraints | undefined; + readonly #toolChoice: llm.ToolChoice | undefined; + readonly #parallelToolCalls: boolean | undefined; + + constructor(parent: LLM, args: LLMStreamArgs) { + super(parent, { + chatCtx: args.chatCtx, + toolCtx: args.toolCtx, + connOptions: args.connOptions, + }); + this.#speko = args.speko; + this.#intent = args.intent; + this.#temperature = args.temperature; + this.#maxTokens = args.maxTokens; + this.#constraints = args.constraints; + this.#toolChoice = args.toolChoice; + this.#parallelToolCalls = args.parallelToolCalls; + } + + protected async run(): Promise { + // Diagnostic logging mirrors speko.TTS: the LiveKit framework consumes + // an LLMStream silently if `run()` returns without ever calling + // `queue.put()`, so without these logs the symptom is a session that + // emits "Creating speech handle" and then nothing - no error, no audio. + // Grep the worker container for `[speko.LLM]` to see the per-turn timeline. + const logger = log(); + const requestId = crypto.randomUUID(); + const t0 = Date.now(); + + const messages = chatContextToSpeko(this.chatCtx); + if (messages.length === 0) { + logger.error( + { requestId, chatCtxItems: this.chatCtx.items.length }, + '[speko.LLM] complete:invalid-context', + ); + throw new SpekoPluginError( + 'speko.LLM: ChatContext produced no convertible messages', + 'INVALID_CONTEXT', + ); + } + + const tools = toolCtxToSpekoTools(this.toolCtx); + + logger.info( + { + requestId, + messageCount: messages.length, + lastRole: messages[messages.length - 1]?.role, + language: this.#intent.language, + optimizeFor: this.#intent.optimizeFor, + constraints: this.#constraints, + toolCount: tools?.length ?? 0, + }, + '[speko.LLM] complete:start', + ); + + const completeParams = { + messages, + intent: { + language: this.#intent.language, + ...(this.#intent.region !== undefined && { region: this.#intent.region }), + ...(this.#intent.optimizeFor !== undefined && { + optimizeFor: this.#intent.optimizeFor, + }), + }, + ...(this.#temperature !== undefined && { temperature: this.#temperature }), + ...(this.#maxTokens !== undefined && { maxTokens: this.#maxTokens }), + ...(this.#constraints !== undefined && { constraints: this.#constraints }), + ...(tools !== undefined && { tools }), + ...(this.#toolChoice !== undefined && { + toolChoice: this.#toolChoice as ChatToolChoice, + }), + ...(this.#parallelToolCalls !== undefined && { + parallelToolCalls: this.#parallelToolCalls, + }), + }; + + let done: + | { + text: string; + provider: string; + model: string; + usage: { promptTokens: number; completionTokens: number }; + failoverCount: number; + toolCalls?: Array<{ id: string; name: string; args: string }>; + } + | undefined; + let streamedTextLength = 0; + try { + for await (const event of this.#speko.completeStream( + completeParams, + this.abortController.signal, + )) { + if (event.type === 'delta') { + streamedTextLength += event.text.length; + this.queue.put({ + id: crypto.randomUUID(), + delta: { + role: 'assistant', + content: event.text, + }, + }); + } else if (event.type === 'done') { + done = event; + } else if (event.type === 'error') { + throw new SpekoPluginError(event.error, event.code); + } + } + } catch (err) { + // VAD-triggered abort is normal mid-utterance: the framework calls + // `abortController.abort()` when it detects new user speech, which + // cancels the in-flight /v1/complete request. Returning cleanly + // lets the session continue with the next turn. Without this catch, + // the AbortError propagates as a fatal `llm_error` and the entire + // AgentSession closes. + if (this.abortController.signal.aborted) { + logger.info({ requestId, elapsedMs: Date.now() - t0 }, '[speko.LLM] complete:aborted'); + return; + } + logger.error( + { + requestId, + elapsedMs: Date.now() - t0, + error: err instanceof Error ? err.message : String(err), + }, + '[speko.LLM] complete:error', + ); + throw err; + } + + if (!done) { + throw new SpekoPluginError( + 'speko.LLM: complete stream ended without a done event', + 'STREAM_ENDED', + ); + } + + const toolCalls = + done.toolCalls && done.toolCalls.length > 0 + ? done.toolCalls.map((tc) => + llm.FunctionCall.create({ callId: tc.id, name: tc.name, args: tc.args }), + ) + : undefined; + + logger.info( + { + requestId, + elapsedMs: Date.now() - t0, + provider: done.provider, + model: done.model, + textLength: done.text?.length ?? 0, + streamedTextLength, + toolCallCount: toolCalls?.length ?? 0, + failoverCount: done.failoverCount, + promptTokens: done.usage.promptTokens, + completionTokens: done.usage.completionTokens, + }, + '[speko.LLM] complete:response', + ); + + // Empty completion (no text AND no tool calls) is a router-side fault + // we don't want to swallow. Without this check the framework consumes + // a content-less assistant delta, never invokes TTS, and the session + // appears frozen with no error. Throwing here surfaces the failure to + // the AgentSession's Error handler so it's visible in worker logs. + const hasText = typeof done.text === 'string' && done.text.length > 0; + if (!hasText && toolCalls === undefined) { + logger.error( + { + requestId, + elapsedMs: Date.now() - t0, + provider: done.provider, + model: done.model, + }, + '[speko.LLM] complete:empty-result', + ); + throw new SpekoPluginError( + `speko.LLM: ${done.provider}/${done.model} returned no text and no tool calls`, + 'EMPTY_COMPLETION', + ); + } + + if (toolCalls !== undefined) { + this.queue.put({ + id: crypto.randomUUID(), + delta: { + role: 'assistant', + toolCalls, + }, + }); + } + + this.queue.put({ + id: crypto.randomUUID(), + delta: { + role: 'assistant', + }, + usage: { + promptTokens: done.usage.promptTokens, + completionTokens: done.usage.completionTokens, + promptCachedTokens: 0, + totalTokens: done.usage.promptTokens + done.usage.completionTokens, + }, + }); + + logger.info( + { + requestId, + contentLength: hasText ? done.text.length : 0, + toolCallCount: toolCalls?.length ?? 0, + }, + '[speko.LLM] queue:put', + ); + } +} + +/** + * Convert a LiveKit `ToolContext` into the SDK's `ChatTool[]` shape. Returns + * `undefined` when there are no tools so the proxy receives a clean payload. + * Schemas are emitted as legacy (non-strict) JSON Schema; the proxy applies + * provider-specific strict-mode adjustments. + */ +function toolCtxToSpekoTools(toolCtx: llm.ToolContext | undefined): ChatTool[] | undefined { + if (!toolCtx) return undefined; + const entries = Object.entries(toolCtx); + if (entries.length === 0) return undefined; + + const tools: ChatTool[] = []; + for (const [name, fn] of entries) { + if (!llm.isFunctionTool(fn)) continue; + tools.push({ + name, + description: fn.description, + parameters: llm.toJsonSchema(fn.parameters, false, false) as Record, + }); + } + return tools.length > 0 ? tools : undefined; +} + +/** + * Flatten a LiveKit `ChatContext` into Speko's `messages` array. System and + * developer items are emitted inline as `role: 'system'`. `FunctionCall` + * items become assistant messages with `toolCalls`; `FunctionCallOutput` + * items become `role: 'tool'` messages with `toolCallId`. Handoff items are + * skipped. Ordering is preserved. + * + * @public + */ +export function chatContextToSpeko(ctx: llm.ChatContext): SpekoChatMessage[] { + const messages: SpekoChatMessage[] = []; + + for (const item of ctx.items) { + if (item instanceof llm.ChatMessage) { + const text = extractText(item); + if (!text) continue; + + const role = + item.role === 'developer' + ? 'system' + : item.role === 'system' || item.role === 'user' || item.role === 'assistant' + ? item.role + : undefined; + if (role === undefined) continue; + + messages.push({ role, content: text }); + continue; + } + + if (item instanceof llm.FunctionCall) { + messages.push({ + role: 'assistant', + content: '', + toolCalls: [{ id: item.callId, name: item.name, args: item.args }], + }); + continue; + } + + if (item instanceof llm.FunctionCallOutput) { + messages.push({ + role: 'tool', + content: item.output, + toolCallId: item.callId, + ...(item.isError && { isError: true }), + }); + } + } + + return messages; +} + +function extractText(message: llm.ChatMessage): string { + const text = message.textContent; + return typeof text === 'string' ? text : ''; +} diff --git a/plugins/speko/src/stt.ts b/plugins/speko/src/stt.ts new file mode 100644 index 000000000..bb59ff026 --- /dev/null +++ b/plugins/speko/src/stt.ts @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { AudioBuffer } from '@livekit/agents'; +import { type APIConnectOptions, asLanguageCode, stt } from '@livekit/agents'; +import type { PipelineConstraints, Speko } from '@spekoai/sdk'; +import { framesToWav } from './audio.js'; +import { type SpekoClientOptions, createSpekoClient } from './client.js'; +import { type Intent, validateIntent } from './intent.js'; + +/** + * Options for the Speko STT component. + * + * @public + */ +export interface STTOptions extends SpekoClientOptions { + /** Routing hint sent with every transcription. */ + intent: Intent; + /** Optional allow-list constraints. */ + constraints?: PipelineConstraints; + /** + * Optional domain keywords forwarded to the underlying provider for + * vocabulary biasing. Casing is preserved for proper nouns. + */ + keywords?: readonly string[]; +} + +/** + * LiveKit Agents STT plugin that delegates recognition to the Speko proxy + * (`POST /v1/transcribe`). The Speko router picks the best STT provider per + * `(language, region, optimizeFor)` and handles failover. + * + * Declares `{ streaming: false }` because this plugin uploads one + * VAD-bounded WAV per recognition call. The underlying `/v1/transcribe` + * response streams transcript events, and the SDK aggregates the final result + * for `_recognize()`. `voice.AgentSession` wraps non-streaming STT plugins with + * `stt.StreamAdapter` automatically when a VAD is configured. + * + * @public + */ +export class STT extends stt.STT { + /** Human-readable model label used by LiveKit metrics and logs. */ + label = 'speko.STT'; + readonly #speko: Speko; + readonly #intent: Intent; + readonly #constraints: PipelineConstraints | undefined; + readonly #keywords: readonly string[] | undefined; + + constructor(options: STTOptions) { + super({ streaming: false, interimResults: false }); + validateIntent(options.intent); + this.#speko = createSpekoClient(options); + this.#intent = options.intent; + this.#constraints = options.constraints; + this.#keywords = options.keywords && options.keywords.length > 0 ? options.keywords : undefined; + } + + /** Provider identifier reported to LiveKit metrics. */ + override get provider(): string { + return 'speko'; + } + + /** Model identifier reported to LiveKit metrics. */ + override get model(): string { + return 'speko-router'; + } + + /** Recognize one VAD-bounded utterance by uploading it to Speko as WAV. */ + protected async _recognize( + frame: AudioBuffer, + abortSignal?: AbortSignal, + ): Promise { + const wav = framesToWav(frame); + const result = await this.#speko.transcribe( + wav, + { + language: this.#intent.language, + ...(this.#intent.region !== undefined && { region: this.#intent.region }), + ...(this.#intent.optimizeFor !== undefined && { + optimizeFor: this.#intent.optimizeFor, + }), + contentType: 'audio/wav', + ...(this.#constraints !== undefined && { constraints: this.#constraints }), + ...(this.#keywords !== undefined && { keywords: this.#keywords }), + }, + abortSignal, + ); + + return { + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text: result.text, + language: asLanguageCode(this.#intent.language), + startTime: 0, + endTime: 0, + confidence: result.confidence ?? 1, + }, + ], + }; + } + + /** + * Native microphone streaming is not supported; LiveKit wraps this STT with + * `stt.StreamAdapter` when used in `voice.AgentSession` with a VAD. + */ + override stream(_options?: { connOptions?: APIConnectOptions }): stt.SpeechStream { + throw new Error( + 'speko.STT does not support native microphone streaming; it uploads one VAD-bounded utterance. ' + + 'Pass it directly to `voice.AgentSession` with a VAD, or wrap it manually ' + + 'with `new stt.StreamAdapter(spekoStt, vad)` when implementing a custom STT node.', + ); + } +} diff --git a/plugins/speko/src/tts.ts b/plugins/speko/src/tts.ts new file mode 100644 index 000000000..b6c708914 --- /dev/null +++ b/plugins/speko/src/tts.ts @@ -0,0 +1,460 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { type APIConnectOptions, AudioByteStream, log, tts } from '@livekit/agents'; +import type { AudioFrame } from '@livekit/rtc-node'; +import type { + PipelineConstraints, + Speko, + SynthesizeResult, + SynthesizeStreamResult, +} from '@spekoai/sdk'; +import { parseWav, pcmSampleRateFromContentType } from './audio.js'; +import { type SpekoClientOptions, createSpekoClient } from './client.js'; +import { type Intent, validateIntent } from './intent.js'; + +/** + * Default output sample rate advertised to the LiveKit `AgentSession`. Speko's + * router pins the upstream provider to 24 kHz mono PCM (Cartesia's native + * format, ElevenLabs via `output_format=pcm_24000`). Any provider that emits + * `audio/mpeg` is rejected - v1 ships no MP3 decoder. + */ +const DEFAULT_SAMPLE_RATE = 24_000; +const NUM_CHANNELS = 1; + +/** + * Options for the Speko TTS component. + * + * @public + */ +export interface TTSOptions extends SpekoClientOptions { + /** Routing intent sent with every synthesis request. */ + intent: Intent; + /** Voice id override forwarded to the Speko proxy. */ + voice?: string; + /** Forwarded speech speed override. */ + speed?: number; + /** + * Output sample rate advertised to the LiveKit agent. Must match what the + * upstream provider actually emits, otherwise playback will be pitched. + * Defaults to 24000 (Cartesia Sonic default). + */ + sampleRate?: number; + /** Optional allow-list constraints. */ + constraints?: PipelineConstraints; +} + +/** + * LiveKit Agents TTS plugin that delegates synthesis to the Speko proxy + * (`POST /v1/synthesize`). The router picks the best TTS provider per intent + * and fails over automatically. + * + * The Speko REST response streams audio bytes. `voice.AgentSession` wraps + * non-streaming TTS plugins with `tts.StreamAdapter` automatically. + * + * **Audio format constraint**: the plugin accepts either `audio/pcm;rate=NNNN` + * or `audio/wav`. The Speko router asks every supported TTS for PCM upstream + * (Cartesia natively, ElevenLabs via `output_format=pcm_24000`), so MP3 should + * never reach the plugin in v1; if it does, `decodeSynthesisResult` throws. + * + * @public + */ +export class TTS extends tts.TTS { + /** Human-readable model label used by LiveKit metrics and logs. */ + label = 'speko.TTS'; + readonly #speko: Speko; + readonly #intent: Intent; + readonly #voice?: string; + readonly #speed?: number; + readonly #sampleRate: number; + readonly #constraints: PipelineConstraints | undefined; + + constructor(options: TTSOptions) { + validateIntent(options.intent); + const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE; + super(sampleRate, NUM_CHANNELS, { streaming: false }); + this.#speko = createSpekoClient(options); + this.#intent = options.intent; + this.#voice = options.voice; + this.#speed = options.speed; + this.#sampleRate = sampleRate; + this.#constraints = options.constraints; + } + + /** Provider identifier reported to LiveKit metrics. */ + override get provider(): string { + return 'speko'; + } + + /** Model identifier reported to LiveKit metrics. */ + override get model(): string { + return 'speko-router'; + } + + /** Synthesize a single text segment through Speko's TTS router. */ + override synthesize( + text: string, + connOptions?: APIConnectOptions, + abortSignal?: AbortSignal, + ): tts.ChunkedStream { + return new ChunkedStream({ + text, + tts: this, + speko: this.#speko, + intent: this.#intent, + voice: this.#voice, + speed: this.#speed, + expectedSampleRate: this.#sampleRate, + constraints: this.#constraints, + connOptions, + abortSignal, + }); + } + + /** + * Native text-input streaming is not supported; LiveKit wraps this TTS with + * `tts.StreamAdapter` when used in `voice.AgentSession`. + */ + override stream(_options?: { connOptions?: APIConnectOptions }): tts.SynthesizeStream { + throw new Error( + 'speko.TTS does not support native text-input streaming; it synthesizes one sentence request at a time. ' + + 'Pass it directly to `voice.AgentSession`, or wrap it manually with ' + + '`new tts.StreamAdapter(spekoTts, sentenceTokenizer)` when implementing a custom TTS node.', + ); + } +} + +interface ChunkedStreamArgs { + text: string; + tts: TTS; + speko: Speko; + intent: Intent; + voice?: string; + speed?: number; + expectedSampleRate: number; + constraints?: PipelineConstraints; + connOptions?: APIConnectOptions; + abortSignal?: AbortSignal; +} + +/** + * Chunked TTS stream that forwards sentence-sized synthesis requests to Speko. + * + * @public + */ +class ChunkedStream extends tts.ChunkedStream { + label = 'speko.ChunkedStream'; + readonly #speko: Speko; + readonly #intent: Intent; + readonly #voice?: string; + readonly #speed?: number; + readonly #expectedSampleRate: number; + readonly #constraints: PipelineConstraints | undefined; + + constructor(args: ChunkedStreamArgs) { + super(args.text, args.tts, args.connOptions, args.abortSignal); + this.#speko = args.speko; + this.#intent = args.intent; + this.#voice = args.voice; + this.#speed = args.speed; + this.#expectedSampleRate = args.expectedSampleRate; + this.#constraints = args.constraints; + } + + protected async run(): Promise { + // Diagnostic logging is intentionally verbose around the synthesize + // boundary because the LiveKit Agents framework emits "TTS stream + // stalled after producing audio, forcing close" with zero context + // about which sentence stalled or what content-type came back. With + // these logs we can grep the worker container for `[speko.TTS]` and + // see the full timeline per turn. + const logger = log(); + const requestId = crypto.randomUUID(); + const t0 = Date.now(); + logger.info( + { + requestId, + textLength: this.inputText.length, + textPreview: this.inputText.slice(0, 80), + voice: this.#voice, + language: this.#intent.language, + optimizeFor: this.#intent.optimizeFor, + constraints: this.#constraints, + expectedSampleRate: this.#expectedSampleRate, + }, + '[speko.TTS] synthesize:start', + ); + + let streamed: SynthesizeStreamResult; + try { + streamed = await this.#speko.synthesizeStream( + this.inputText, + { + language: this.#intent.language, + ...(this.#intent.region !== undefined && { region: this.#intent.region }), + ...(this.#intent.optimizeFor !== undefined && { + optimizeFor: this.#intent.optimizeFor, + }), + ...(this.#voice !== undefined && { voice: this.#voice }), + ...(this.#speed !== undefined && { speed: this.#speed }), + ...(this.#constraints !== undefined && { constraints: this.#constraints }), + }, + this.abortSignal, + ); + } catch (err) { + logger.error( + { + requestId, + elapsedMs: Date.now() - t0, + error: err instanceof Error ? err.message : String(err), + }, + '[speko.TTS] synthesize:error', + ); + throw err; + } + + const t1 = Date.now(); + if (streamed.contentType.toLowerCase().startsWith('audio/pcm')) { + await this.#streamPcmResult(streamed, requestId, t0, t1); + return; + } + + const chunks: Uint8Array[] = []; + for await (const chunk of streamed) chunks.push(chunk); + const result: SynthesizeResult = { + audio: concatChunks(chunks), + contentType: streamed.contentType, + provider: streamed.provider, + model: streamed.model, + failoverCount: streamed.failoverCount, + scoresRunId: streamed.scoresRunId, + }; + logger.info( + { + requestId, + elapsedMs: t1 - t0, + contentType: result.contentType, + audioBytes: result.audio.byteLength, + provider: result.provider, + }, + '[speko.TTS] synthesize:response', + ); + + const { pcm, sampleRate, channels } = decodeSynthesisResult(result); + + if (sampleRate !== this.#expectedSampleRate) { + logger.error( + { + requestId, + actualSampleRate: sampleRate, + expectedSampleRate: this.#expectedSampleRate, + }, + '[speko.TTS] synthesize:sample-rate-mismatch', + ); + throw new Error( + `speko.TTS: provider returned audio at ${sampleRate} Hz but the TTS was ` + + `configured for ${this.#expectedSampleRate} Hz. Either set ` + + `\`sampleRate: ${sampleRate}\` on speko.TTS or pin the Speko router to a ` + + `provider that matches the expected rate.`, + ); + } + + const samplesPerFrame = Math.round(sampleRate / 50); + const bstream = new AudioByteStream(sampleRate, channels, samplesPerFrame); + const frames = [...bstream.write(pcm), ...bstream.flush()]; + + if (frames.length === 0) { + logger.error({ requestId }, '[speko.TTS] synthesize:empty-frames'); + throw new Error('speko.TTS: provider returned empty audio'); + } + + logger.info( + { + requestId, + frameCount: frames.length, + sampleRate, + channels, + pcmBytes: pcm.byteLength, + durationMs: Math.round((pcm.byteLength / 2 / sampleRate) * 1000), + decodeMs: Date.now() - t1, + }, + '[speko.TTS] synthesize:frames-ready', + ); + + this.#pushFrames(frames, requestId); + + logger.info({ requestId, totalElapsedMs: Date.now() - t0 }, '[speko.TTS] synthesize:done'); + } + + async #streamPcmResult( + streamed: SynthesizeStreamResult, + requestId: string, + startedAt: number, + responseAt: number, + ): Promise { + const logger = log(); + const sampleRate = pcmSampleRateFromContentType( + streamed.contentType.toLowerCase(), + this.#expectedSampleRate, + ); + if (sampleRate !== this.#expectedSampleRate) { + throw new Error( + `speko.TTS: provider returned audio at ${sampleRate} Hz but the TTS was ` + + `configured for ${this.#expectedSampleRate} Hz.`, + ); + } + + const samplesPerFrame = Math.round(sampleRate / 50); + const bstream = new AudioByteStream(sampleRate, NUM_CHANNELS, samplesPerFrame); + let pending: AudioFrame | undefined; + let pushed = 0; + let bytes = 0; + let firstFrameMs: number | undefined; + const flush = (final: boolean) => { + if (!pending) return; + this.queue.put({ + requestId, + segmentId: requestId, + frame: pending, + final, + }); + pending = undefined; + pushed += 1; + firstFrameMs ??= Date.now() - startedAt; + }; + + for await (const chunk of streamed) { + bytes += chunk.byteLength; + for (const frame of bstream.write(chunk)) { + flush(false); + pending = frame; + } + } + for (const frame of bstream.flush()) { + flush(false); + pending = frame; + } + flush(true); + + if (pushed === 0) { + logger.error({ requestId }, '[speko.TTS] synthesize:empty-frames'); + throw new Error('speko.TTS: provider returned empty audio'); + } + + logger.info( + { + requestId, + responseMs: responseAt - startedAt, + firstFrameMs, + totalElapsedMs: Date.now() - startedAt, + frameCount: pushed, + pcmBytes: bytes, + provider: streamed.provider, + }, + '[speko.TTS] synthesize:streamed-pcm-done', + ); + } + + #pushFrames(frames: AudioFrame[], requestId: string): void { + const logger = log(); + const t0 = Date.now(); + let pushed = 0; + let pending: AudioFrame | undefined; + const flush = (final: boolean) => { + if (!pending) return; + this.queue.put({ + requestId, + segmentId: requestId, + frame: pending, + final, + }); + pending = undefined; + pushed += 1; + }; + + for (const frame of frames) { + flush(false); + pending = frame; + } + flush(true); + + logger.info( + { + requestId, + pushedCount: pushed, + expectedCount: frames.length, + pushMs: Date.now() - t0, + }, + '[speko.TTS] pushFrames:done', + ); + } +} + +function concatChunks(chunks: readonly Uint8Array[]): Uint8Array { + const total = chunks.reduce((sum, chunk) => sum + chunk.byteLength, 0); + const out = new Uint8Array(total); + let offset = 0; + for (const chunk of chunks) { + out.set(chunk, offset); + offset += chunk.byteLength; + } + return out; +} + +/** + * Decode a `SynthesizeResult` into raw PCM + sample rate + channel count. + * Branches on `contentType`: + * + * - `audio/pcm;rate=NNNN` returns the raw payload, with the rate parsed from MIME parameters. + * Cartesia's contract is mono, so channels is pinned to one channel. + * - `audio/wav` / `audio/x-wav` returns the WAV body after `parseWav` strips the header. The + * embedded channel count is validated - v1 only handles mono, and a stereo + * response would otherwise be fed to a mono `AudioByteStream` and played at + * half speed with L/R mixed. + * - `audio/mpeg` or anything else throws, documented v1 limitation. + * + * Exported for unit testing. + * + * @public + */ +export function decodeSynthesisResult(result: SynthesizeResult): { + pcm: Uint8Array; + sampleRate: number; + channels: number; +} { + const contentType = result.contentType.toLowerCase(); + + if (contentType.startsWith('audio/pcm')) { + return { + pcm: result.audio, + sampleRate: pcmSampleRateFromContentType(contentType, DEFAULT_SAMPLE_RATE), + channels: NUM_CHANNELS, + }; + } + + if (contentType.startsWith('audio/wav') || contentType.startsWith('audio/x-wav')) { + const { pcm, sampleRate, channels } = parseWav(result.audio); + if (channels !== NUM_CHANNELS) { + throw new Error( + `speko.TTS: WAV response has ${channels} channels but the plugin is ` + + `configured for ${NUM_CHANNELS}. Configure the Speko router to return ` + + `mono audio, or pin a mono-only provider.`, + ); + } + return { pcm, sampleRate, channels }; + } + + if (contentType.startsWith('audio/mpeg')) { + throw new Error( + `speko.TTS: received ${result.contentType} from provider "${result.provider}". ` + + 'v1 only supports raw PCM (`audio/pcm;rate=NNNN`) and WAV (`audio/wav`). ' + + 'Configure your Speko routing intent so Cartesia is preferred, or pin the ' + + 'TTS provider explicitly.', + ); + } + + throw new Error( + `speko.TTS: unsupported content type "${result.contentType}" from provider ` + + `"${result.provider}". Expected audio/pcm, audio/wav, or (in future) audio/mpeg.`, + ); +} diff --git a/plugins/speko/tsconfig.json b/plugins/speko/tsconfig.json new file mode 100644 index 000000000..cbe5763f7 --- /dev/null +++ b/plugins/speko/tsconfig.json @@ -0,0 +1,15 @@ +{ + "extends": "../../tsconfig.json", + "include": ["./src"], + "compilerOptions": { + // match output dir to input dir. e.g. dist/index instead of dist/src/index + "rootDir": "./src", + "declarationDir": "./dist", + "outDir": "./dist" + }, + "typedocOptions": { + "name": "plugins/agents-plugin-speko", + "entryPointStrategy": "resolve", + "entryPoints": ["src/index.ts"] + } +} diff --git a/plugins/speko/tsup.config.ts b/plugins/speko/tsup.config.ts new file mode 100644 index 000000000..b491713a4 --- /dev/null +++ b/plugins/speko/tsup.config.ts @@ -0,0 +1,6 @@ +import { defineConfig } from 'tsup'; +import defaults from '../../tsup.config'; + +export default defineConfig({ + ...defaults, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 51c33a91e..3e529e495 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1243,6 +1243,34 @@ importers: specifier: ^5.0.0 version: 5.9.3 + plugins/speko: + dependencies: + '@spekoai/sdk': + specifier: ^0.4.1 + version: 0.4.1 + devDependencies: + '@livekit/agents': + specifier: workspace:* + version: link:../../agents + '@livekit/agents-plugin-silero': + specifier: workspace:* + version: link:../silero + '@livekit/agents-plugins-test': + specifier: workspace:* + version: link:../test + '@livekit/rtc-node': + specifier: 'catalog:' + version: 0.13.27 + '@microsoft/api-extractor': + specifier: ^7.35.0 + version: 7.43.7(@types/node@25.6.0) + tsup: + specifier: ^8.3.5 + version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@25.6.0))(postcss@8.5.9)(tsx@4.21.0)(typescript@5.9.3) + typescript: + specifier: ^5.0.0 + version: 5.9.3 + plugins/tavus: dependencies: livekit-server-sdk: @@ -2744,6 +2772,9 @@ packages: '@sinclair/typebox@0.27.8': resolution: {integrity: sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA==} + '@spekoai/sdk@0.4.1': + resolution: {integrity: sha512-YAYu7fVDpzIgR7vCt03p9XImhmLMhXviSpQEnYXDzyMhL81P04ovcTw7/0FBJ+xjDi7w+CrCBR3bDyzqXI6mIQ==} + '@standard-schema/spec@1.1.0': resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==} @@ -6862,6 +6893,10 @@ snapshots: '@sinclair/typebox@0.27.8': {} + '@spekoai/sdk@0.4.1': + dependencies: + tslib: 2.6.2 + '@standard-schema/spec@1.1.0': {} '@trivago/prettier-plugin-sort-imports@4.3.0(prettier@3.2.5)': diff --git a/turbo.json b/turbo.json index 7fd515bc2..b03435e86 100644 --- a/turbo.json +++ b/turbo.json @@ -74,6 +74,8 @@ "RUNWAY_AVATAR_ID", "RUNWAY_AVATAR_PRESET_ID", "SARVAM_API_KEY", + "SPEKO_API_KEY", + "SPEKO_BASE_URL", "SIP_PARTICIPANT_IDENTITY", "SIP_PHONE_NUMBER", "LK_OPENAI_DEBUG",