Skip to content

Commit 77cdcb5

Browse files
Merge pull request #183 from jamesrochabrun/feature/realtime-fixes-and-audio-improvements
Fix Realtime API issues and audio engine crash
2 parents fe1bc41 + 296f1f5 commit 77cdcb5

4 files changed

Lines changed: 94 additions & 24 deletions

File tree

Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
6262
logger.debug("MicrophonePCMSampleVendorAE is being freed")
6363
}
6464

65-
public func start() throws -> AsyncStream<AVAudioPCMBuffer> {
65+
func start() throws -> AsyncStream<AVAudioPCMBuffer> {
6666
guard
6767
let desiredTapFormat = AVAudioFormat(
6868
commonFormat: .pcmFormatInt16,
@@ -87,14 +87,10 @@ class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
8787
return AsyncStream<AVAudioPCMBuffer> { [weak self] continuation in
8888
guard let this = self else { return }
8989
this.continuation = continuation
90-
this.inputNode.installTap(onBus: 0, bufferSize: targetBufferSize, format: desiredTapFormat) { [weak this] sampleBuffer, _ in
91-
if let accumulatedBuffer = this?.microphonePCMSampleVendorCommon.resampleAndAccumulate(sampleBuffer) {
92-
// If the buffer has accumulated to a sufficient level, give it back to the caller
93-
Task { @RealtimeActor in
94-
this?.continuation?.yield(accumulatedBuffer)
95-
}
96-
}
97-
}
90+
this.installTapNonIsolated(
91+
inputNode: this.inputNode,
92+
bufferSize: targetBufferSize,
93+
format: desiredTapFormat)
9894
}
9995
}
10096

@@ -111,5 +107,22 @@ class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
111107
private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon()
112108
private var continuation: AsyncStream<AVAudioPCMBuffer>.Continuation?
113109

110+
private nonisolated func installTapNonIsolated(
111+
inputNode: AVAudioInputNode,
112+
bufferSize: AVAudioFrameCount,
113+
format: AVAudioFormat)
114+
{
115+
inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format) { [weak self] sampleBuffer, _ in
116+
guard let self else { return }
117+
Task { await self.processBuffer(sampleBuffer) }
118+
}
119+
}
120+
121+
private func processBuffer(_ buffer: AVAudioPCMBuffer) {
122+
if let accumulatedBuffer = microphonePCMSampleVendorCommon.resampleAndAccumulate(buffer) {
123+
continuation?.yield(accumulatedBuffer)
124+
}
125+
}
126+
114127
}
115128
#endif

Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,62 @@ open class OpenAIRealtimeSession {
273273
logger.warning("Received response.done with unexpected format")
274274
}
275275

276+
case "response.text.delta":
277+
if let delta = json["delta"] as? String {
278+
continuation?.yield(.responseTextDelta(delta))
279+
}
280+
281+
case "response.text.done":
282+
if let text = json["text"] as? String {
283+
continuation?.yield(.responseTextDone(text))
284+
}
285+
286+
case "response.output_item.added":
287+
if
288+
let item = json["item"] as? [String: Any],
289+
let itemId = item["id"] as? String,
290+
let type = item["type"] as? String
291+
{
292+
continuation?.yield(.responseOutputItemAdded(itemId: itemId, type: type))
293+
}
294+
295+
case "response.output_item.done":
296+
if
297+
let item = json["item"] as? [String: Any],
298+
let itemId = item["id"] as? String,
299+
let type = item["type"] as? String
300+
{
301+
let content = item["content"] as? [[String: Any]]
302+
continuation?.yield(.responseOutputItemDone(itemId: itemId, type: type, content: content))
303+
}
304+
305+
case "response.content_part.added":
306+
if
307+
let part = json["part"] as? [String: Any],
308+
let type = part["type"] as? String
309+
{
310+
continuation?.yield(.responseContentPartAdded(type: type))
311+
}
312+
313+
case "response.content_part.done":
314+
if
315+
let part = json["part"] as? [String: Any],
316+
let type = part["type"] as? String
317+
{
318+
let text = part["text"] as? String
319+
continuation?.yield(.responseContentPartDone(type: type, text: text))
320+
}
321+
322+
case "conversation.item.created":
323+
if
324+
let item = json["item"] as? [String: Any],
325+
let itemId = item["id"] as? String,
326+
let type = item["type"] as? String
327+
{
328+
let role = item["role"] as? String
329+
continuation?.yield(.conversationItemCreated(itemId: itemId, type: type, role: role))
330+
}
331+
276332
default:
277333
// Log unhandled message types with more detail for debugging
278334
logger.warning("⚠️ Unhandled message type: \(messageType)")

Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
/// Realtime session configuration
1212
/// https://platform.openai.com/docs/api-reference/realtime-client-events/session/update#realtime-client-events/session/update-session
1313
public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
14-
1514
public init(
1615
inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
1716
inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil,
@@ -41,7 +40,6 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
4140
}
4241

4342
public enum ToolChoice: Encodable, Sendable {
44-
4543
/// The model will not call any tool and instead generates a message.
4644
/// This is the default when no tools are present in the request body
4745
case none
@@ -73,22 +71,14 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
7371
case .specific(let functionName):
7472
var container = encoder.container(keyedBy: RootKey.self)
7573
try container.encode("function", forKey: .type)
76-
var functionContainer = container.nestedContainer(
77-
keyedBy: FunctionKey.self,
78-
forKey: .function)
79-
try functionContainer.encode(functionName, forKey: .name)
74+
try container.encode(functionName, forKey: .name)
8075
}
8176
}
8277

8378
private enum RootKey: CodingKey {
8479
case type
85-
case function
86-
}
87-
88-
private enum FunctionKey: CodingKey {
8980
case name
9081
}
91-
9282
}
9383

9484
/// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
@@ -157,7 +147,6 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
157147
case turnDetection = "turn_detection"
158148
case voice
159149
}
160-
161150
}
162151

163152
// MARK: OpenAIRealtimeSessionConfiguration.InputAudioTranscription
@@ -238,7 +227,6 @@ extension OpenAIRealtimeSessionConfiguration {
238227

239228
extension OpenAIRealtimeSessionConfiguration {
240229
public struct TurnDetection: Encodable, Sendable {
241-
242230
public init(
243231
type: DetectionType)
244232
{
@@ -270,7 +258,6 @@ extension OpenAIRealtimeSessionConfiguration {
270258
case type
271259
case eagerness
272260
}
273-
274261
}
275262
}
276263

@@ -321,6 +308,5 @@ extension OpenAIRealtimeSessionConfiguration.TurnDetection {
321308
case medium
322309
case high
323310
}
324-
325311
}
326312
}

Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,19 @@ public enum OpenAIRealtimeMessage: Sendable {
2929

3030
/// Response completion with potential errors
3131
case responseDone(status: String, statusDetails: [String: Any]?) // "response.done"
32+
33+
// Text streaming (for text-only responses)
34+
case responseTextDelta(String) // "response.text.delta"
35+
case responseTextDone(String) // "response.text.done"
36+
37+
// Output item lifecycle
38+
case responseOutputItemAdded(itemId: String, type: String) // "response.output_item.added"
39+
case responseOutputItemDone(itemId: String, type: String, content: [[String: Any]]?) // "response.output_item.done"
40+
41+
// Content part lifecycle
42+
case responseContentPartAdded(type: String) // "response.content_part.added"
43+
case responseContentPartDone(type: String, text: String?) // "response.content_part.done"
44+
45+
/// Conversation item
46+
case conversationItemCreated(itemId: String, type: String, role: String?) // "conversation.item.created"
3247
}

0 commit comments

Comments (0)