diff --git a/agents/src/voice/background_audio.ts b/agents/src/voice/background_audio.ts index bb2ffbfc5..4d5cecf87 100644 --- a/agents/src/voice/background_audio.ts +++ b/agents/src/voice/background_audio.ts @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 import { AudioFrame, + AudioMixer, AudioSource, LocalAudioTrack, type LocalTrackPublication, @@ -57,7 +58,7 @@ export interface BackgroundAudioPlayerOptions { /** * Sound to play when the agent is thinking. - * TODO (Brian): Implement thinking sound when AudioMixer becomes available + * Plays when agent state changes to 'thinking' and stops when it changes to other states. */ thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[]; @@ -113,15 +114,16 @@ export class PlayHandle { * This class handles playing ambient sounds and manages audio track publishing. * It supports: * - Continuous ambient sound playback with looping + * - Thinking sound playback during agent processing + * - Multiple simultaneous audio streams via AudioMixer * - Volume control and probability-based sound selection * - Integration with LiveKit rooms and agent sessions * - * Note: Thinking sound not yet supported - * * @example * ```typescript * const player = new BackgroundAudioPlayer({ * ambientSound: { source: BuiltinAudioClip.OFFICE_AMBIENCE, volume: 0.8 }, + * thinkingSound: { source: BuiltinAudioClip.KEYBOARD_TYPING, volume: 0.6 }, * }); * * await player.start({ room, agentSession }); @@ -130,9 +132,12 @@ export class PlayHandle { export class BackgroundAudioPlayer { private ambientSound?: AudioSourceType | AudioConfig | AudioConfig[]; private thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[]; + private streamTimeoutMs: number; private playTasks: Task[] = []; private audioSource = new AudioSource(48000, 1, AUDIO_SOURCE_BUFFER_MS); + private audioMixer: AudioMixer; + private mixerTask?: Task; private room?: Room; private agentSession?: AgentSession; @@ -143,20 +148,24 @@ export class BackgroundAudioPlayer { 
private ambientHandle?: PlayHandle; private thinkingHandle?: PlayHandle; + private closed = true; + // TODO (Brian): add lock #logger = log(); constructor(options?: BackgroundAudioPlayerOptions) { - const { ambientSound, thinkingSound } = options || {}; + const { ambientSound, thinkingSound, streamTimeoutMs = 200 } = options || {}; this.ambientSound = ambientSound; this.thinkingSound = thinkingSound; + this.streamTimeoutMs = streamTimeoutMs; - if (this.thinkingSound) { - this.#logger.warn('thinkingSound is not yet supported'); - // TODO: Implement thinking sound when AudioMixer becomes available - } + this.audioMixer = new AudioMixer(48000, 1, { + blocksize: 4800, + capacity: 1, + streamTimeoutMs: this.streamTimeoutMs, + }); } /** @@ -278,11 +287,25 @@ export class BackgroundAudioPlayer { this.agentSession = agentSession; this.trackPublishOptions = trackPublishOptions; + this.closed = false; + await this.publishTrack(); // TODO (Brian): check job context is not fake - // TODO (Brian): start audio mixer task + // Start audio mixer task + this.mixerTask = Task.from(async () => { + try { + await this.runMixerTask(); + } catch (err) { + if (this.closed) { + // expected (AudioSource is closed) + return; + } + throw err; + } + }); + this.room.on('reconnected', this.onReconnected); this.agentSession?.on(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged); @@ -301,16 +324,21 @@ export class BackgroundAudioPlayer { * Close and cleanup the background audio system */ async close(): Promise<void> { + this.closed = true; await cancelAndWait(this.playTasks, TASK_TIMEOUT_MS); if (this.republishTask) { await this.republishTask.cancelAndWait(TASK_TIMEOUT_MS); } - // TODO (Brian): cancel audio mixer task and close audio mixer - + // Cancel audio mixer task and close audio mixer + await this.audioMixer.aclose(); await this.audioSource.close(); + if (this.mixerTask) { + await this.mixerTask.cancelAndWait(TASK_TIMEOUT_MS); + } + 
this.agentSession?.off(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged); this.room?.off('reconnected', this.onReconnected); @@ -372,12 +400,43 @@ export class BackgroundAudioPlayer { return; } - // TODO (Brian): play thinking sound and assign to thinkingHandle + const normalized = this.normalizeSoundSource(this.thinkingSound); + if (normalized) { + const { source, volume } = normalized; + const selectedSound: AudioConfig = { source, volume, probability: 1.0 }; + this.thinkingHandle = this.play(selectedSound); + } } else { this.thinkingHandle?.stop(); } }; + private applyVolumeToFrame(frame: AudioFrame, volume: number): AudioFrame { + const int16Data = new Int16Array( + frame.data.buffer, + frame.data.byteOffset, + frame.data.byteLength / 2, + ); + const float32Data = new Float32Array(int16Data.length); + + for (let i = 0; i < int16Data.length; i++) { + float32Data[i] = int16Data[i]!; + } + + const volumeFactor = 10 ** Math.log10(volume); + for (let i = 0; i < float32Data.length; i++) { + float32Data[i]! *= volumeFactor; + } + + const outputData = new Int16Array(float32Data.length); + for (let i = 0; i < float32Data.length; i++) { + const clipped = Math.max(-32768, Math.min(32767, float32Data[i]!)); + outputData[i] = Math.round(clipped); + } + + return new AudioFrame(outputData, frame.sampleRate, frame.channels, frame.samplesPerChannel); + } + private async playTask({ playHandle, sound, @@ -395,57 +454,44 @@ export class BackgroundAudioPlayer { sound = getBuiltinAudioPath(sound); } + let audioStream: AsyncIterable<AudioFrame>; if (typeof sound === 'string') { - sound = loop + audioStream = loop ? 
loopAudioFramesFromFile(sound, { abortSignal: signal }) : audioFramesFromFile(sound, { abortSignal: signal }); + } else { + audioStream = sound; } - try { - for await (const frame of sound) { - if (signal.aborted || playHandle.done()) break; + const applyVolume = this.applyVolumeToFrame.bind(this); - let processedFrame: AudioFrame; - - if (volume !== 1.0) { - const int16Data = new Int16Array( - frame.data.buffer, - frame.data.byteOffset, - frame.data.byteLength / 2, - ); - const float32Data = new Float32Array(int16Data.length); - - for (let i = 0; i < int16Data.length; i++) { - float32Data[i] = int16Data[i]!; - } - - const volumeFactor = 10 ** Math.log10(volume); - for (let i = 0; i < float32Data.length; i++) { - float32Data[i]! *= volumeFactor; - } - - const outputData = new Int16Array(float32Data.length); - for (let i = 0; i < float32Data.length; i++) { - const clipped = Math.max(-32768, Math.min(32767, float32Data[i]!)); - outputData[i] = Math.round(clipped); - } - - processedFrame = new AudioFrame( - outputData, - frame.sampleRate, - frame.channels, - frame.samplesPerChannel, - ); - } else { - processedFrame = frame; - } - - // TODO (Brian): use AudioMixer to add/remove frame streams - await this.audioSource.captureFrame(processedFrame); + async function* genWrapper(): AsyncGenerator<AudioFrame> { + for await (const frame of audioStream) { + if (signal.aborted || playHandle.done()) break; + yield volume !== 1.0 ? 
applyVolume(frame, volume) : frame; } // TODO: the waitForPlayout() may be inaccurate by 400ms + playHandle._markPlayoutDone(); + } + + const gen = genWrapper(); + try { + this.audioMixer.addStream(gen); + await playHandle.waitForPlayout(); // wait for playout or interruption } finally { - // TODO: the waitForPlayout() may be innaccurate by 400ms + this.audioMixer.removeStream(gen); playHandle._markPlayoutDone(); + + // Close the generator if it was stopped early + if (playHandle.done()) { + await gen.return(undefined); + } + } + } + + private async runMixerTask(): Promise<void> { + for await (const frame of this.audioMixer) { + await this.audioSource.captureFrame(frame); } } } diff --git a/examples/src/background_audio.ts b/examples/src/background_audio.ts index 4c229fb48..857521150 100644 --- a/examples/src/background_audio.ts +++ b/examples/src/background_audio.ts @@ -12,7 +12,10 @@ import { z } from 'zod'; * Background audio could make the agent feel more realistic, versus perfect silence * in the background. 
* - * NOTE: Thinking sound is not yet supported (requires AudioMixer implementation) + * This example demonstrates: + * - Ambient background sound (office ambience) playing continuously + * - Thinking sound (keyboard typing) that plays when the agent is processing/thinking + * - Multiple sounds can play simultaneously via AudioMixer */ export default defineAgent({ @@ -47,11 +50,12 @@ export default defineAgent({ const backgroundAudio = new voice.BackgroundAudioPlayer({ ambientSound: voice.BuiltinAudioClip.OFFICE_AMBIENCE, - // TODO: Thinking sound not yet supported - // thinkingSound: [ - // { source: voice.BuiltinAudioClip.KEYBOARD_TYPING, volume: 0.8 }, - // { source: voice.BuiltinAudioClip.KEYBOARD_TYPING2, volume: 0.7 }, - // ], + // Thinking sound will play when the agent enters 'thinking' state (e.g., during tool calls) + // Multiple sounds with different probabilities/volumes can be provided + thinkingSound: [ + { source: voice.BuiltinAudioClip.KEYBOARD_TYPING, volume: 0.8, probability: 0.6 }, + { source: voice.BuiltinAudioClip.KEYBOARD_TYPING2, volume: 0.7, probability: 0.4 }, + ], }); await backgroundAudio.start({ room: ctx.room, agentSession: session });